HDFS-14111. hdfsOpenFile on HDFS causes unnecessary IO from file offset 0. Contributed by Sahil Takiar.

Signed-off-by: Wei-Chiu Chuang <weichiu@apache.org>
2019-03-06 14:57:46 -08:00 · 2019-03-06 14:57:46 -08:00 · f5a4b43a49
parent 01ada40ea4
commit f5a4b43a49
8 changed files with 390 additions and 12 deletions
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java
@ -740,6 +740,7 @@ public class CryptoInputStream extends FilterInputStream implements
    case StreamCapabilities.READAHEAD:
    case StreamCapabilities.DROPBEHIND:
    case StreamCapabilities.UNBUFFER:
+    case StreamCapabilities.READBYTEBUFFER:
      return true;
    default:
      return false;
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ByteBufferPositionedReadable.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ByteBufferPositionedReadable.java
@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * Implementers of this interface provide a positioned read API that writes to a
+ * {@link ByteBuffer} rather than a {@code byte[]}.
+ *
+ * @see PositionedReadable
+ * @see ByteBufferReadable
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+public interface ByteBufferPositionedReadable {
+  /**
+   * Reads up to {@code buf.remaining()} bytes into buf from a given position
+   * in the file and returns the number of bytes read. Callers should use
+   * {@code buf.limit(...)} to control the size of the desired read and
+   * {@code buf.position(...)} to control the offset into the buffer the data
+   * should be written to.
+   * <p>
+   * After a successful call, {@code buf.position()} will be advanced by the
+   * number of bytes read and {@code buf.limit()} should be unchanged.
+   * <p>
+   * In the case of an exception, the values of {@code buf.position()} and
+   * {@code buf.limit()} are undefined, and callers should be prepared to
+   * recover from this eventuality.
+   * <p>
+   * Many implementations will throw {@link UnsupportedOperationException}, so
+   * callers that are not confident in support for this method from the
+   * underlying filesystem should be prepared to handle that exception.
+   * <p>
+   * Implementations should treat 0-length requests as legitimate, and must not
+   * signal an error upon their receipt.
+   *
+   * @param position position within file
+   * @param buf the ByteBuffer to receive the results of the read operation.
+   * @return the number of bytes read, possibly zero, or -1 if reached
+   *         end-of-stream
+   * @throws IOException if there is some error performing the read
+   */
+  int read(long position, ByteBuffer buf) throws IOException;
+}
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StreamCapabilities.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StreamCapabilities.java
@ -59,6 +59,12 @@ public interface StreamCapabilities {
   */
  String UNBUFFER = "in:unbuffer";

+  /**
+   * Stream read(ByteBuffer) capability implemented by
+   * {@link ByteBufferReadable#read(java.nio.ByteBuffer)}.
+   */
+  String READBYTEBUFFER = "in:readbytebuffer";
+
  /**
   * Capabilities that a stream can support and be queried for.
   */
--- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java
@ -1779,6 +1779,7 @@ public class DFSInputStream extends FSInputStream
    case StreamCapabilities.READAHEAD:
    case StreamCapabilities.DROPBEHIND:
    case StreamCapabilities.UNBUFFER:
+    case StreamCapabilities.READBYTEBUFFER:
      return true;
    default:
      return false;
--- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c
+++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c
@ -1013,7 +1013,7 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags,
       return f{is|os};
    */
    int accmode = flags & O_ACCMODE;
-    jstring jStrBufferSize = NULL, jStrReplication = NULL;
+    jstring jStrBufferSize = NULL, jStrReplication = NULL, jCapabilityString = NULL;
    jobject jConfiguration = NULL, jPath = NULL, jFile = NULL;
    jobject jFS = (jobject)fs;
    jthrowable jthr;
@ -1171,16 +1171,22 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags,
    file->flags = 0;

    if ((flags & O_WRONLY) == 0) {
-        // Try a test read to see if we can do direct reads
-        char buf;
-        if (readDirect(fs, file, &buf, 0) == 0) {
-            // Success - 0-byte read should return 0
+        // Check the StreamCapabilities of jFile to see if we can do direct reads
+        jthr = newJavaStr(env, "in:readbytebuffer", &jCapabilityString);
+        if (jthr) {
+            ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+                                        "hdfsOpenFile(%s): newJavaStr", path);
+            goto done;
+        }
+        jthr = invokeMethod(env, &jVal, INSTANCE, jFile, HADOOP_ISTRM,
+                            "hasCapability", "(Ljava/lang/String;)Z", jCapabilityString);
+        if (jthr) {
+            ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+                                        "hdfsOpenFile(%s): FSDataInputStream#hasCapability", path);
+            goto done;
+        }
+        if (jVal.z) {
            file->flags |= HDFS_FILE_SUPPORTS_DIRECT_READ;
-        } else if (errno != ENOTSUP) {
-            // Unexpected error. Clear it, don't set the direct flag.
-            fprintf(stderr,
-                  "hdfsOpenFile(%s): WARN: Unexpected error %d when testing "
-                  "for direct read compatibility\n", path, errno);
        }
    }
    ret = 0;
@ -1191,6 +1197,7 @@ done:
    destroyLocalReference(env, jConfiguration); 
    destroyLocalReference(env, jPath); 
    destroyLocalReference(env, jFile);
+    destroyLocalReference(env, jCapabilityString);
    if (ret) {
        if (file) {
            if (file->file) {
--- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_ext_test.cc
+++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_ext_test.cc
@ -503,7 +503,10 @@ TEST_F(HdfsExtTest, TestReadStats) {
  hdfsFileFreeReadStatistics(stats);

  EXPECT_EQ(0, hdfsCloseFile(fs, file));
-  EXPECT_EQ(0, errno);
+  // Since libhdfs is not guaranteed to set errno to 0 on successful
+  // operations, we disable this check for now, see HDFS-14325 for a
+  // long term solution to this problem
+  // EXPECT_EQ(0, errno);
 }

 //Testing working directory
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/AddBlockPoolException.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/AddBlockPoolException.java
@ -0,0 +1,27 @@
+package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl;
+
+import java.io.IOException;
+import java.util.Map;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
+
+/**
+ * This exception collects all IOExceptions thrown when adding block pools and
+ * scanning volumes. It keeps the information about which volume is associated
+ * with an exception.
+ *
+ */
+public class AddBlockPoolException extends IOException {
+  private Map<FsVolumeSpi, IOException> unhealthyDataDirs;
+  public AddBlockPoolException(Map<FsVolumeSpi, IOException>
+      unhealthyDataDirs) {
+    this.unhealthyDataDirs = unhealthyDataDirs;
+  }
+
+  public Map<FsVolumeSpi, IOException> getFailingVolumes() {
+    return unhealthyDataDirs;
+  }
+  @Override
+  public String toString() {
+    return getClass().getName() + ": " + unhealthyDataDirs.toString();
+  }
+}
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestByteBufferPread.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestByteBufferPread.java
@ -0,0 +1,269 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Random;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * This class tests the DFS positional read functionality on a single node
+ * mini-cluster. These tests are inspired from {@link TestPread}. The tests
+ * are much less comprehensive than other pread tests because pread already
+ * internally uses {@link ByteBuffer}s.
+ */
+public class TestByteBufferPread {
+
+  private static MiniDFSCluster cluster;
+  private static FileSystem fs;
+  private static byte[] fileContents;
+  private static Path testFile;
+  private static Random rand;
+
+  private static final long SEED = 0xDEADBEEFL;
+  private static final int BLOCK_SIZE = 4096;
+  private static final int FILE_SIZE = 12 * BLOCK_SIZE;
+
+  @BeforeClass
+  public static void setup() throws IOException {
+    // Setup the cluster with a small block size so we can create small files
+    // that span multiple blocks
+    Configuration conf = new Configuration();
+    conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
+    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+    fs = cluster.getFileSystem();
+
+    // Create a test file that spans 12 blocks, and contains a bunch of random
+    // bytes
+    fileContents = new byte[FILE_SIZE];
+    rand = new Random(SEED);
+    rand.nextBytes(fileContents);
+    testFile = new Path("/byte-buffer-pread-test.dat");
+    try (FSDataOutputStream out = fs.create(testFile, (short) 3)) {
+      out.write(fileContents);
+    }
+  }
+
+  /**
+   * Test preads with {@link java.nio.HeapByteBuffer}s.
+   */
+  @Test
+  public void testPreadWithHeapByteBuffer() throws IOException {
+    testPreadWithByteBuffer(ByteBuffer.allocate(FILE_SIZE));
+    testPreadWithFullByteBuffer(ByteBuffer.allocate(FILE_SIZE));
+    testPreadWithPositionedByteBuffer(ByteBuffer.allocate(FILE_SIZE));
+    testPreadWithLimitedByteBuffer(ByteBuffer.allocate(FILE_SIZE));
+    testPositionedPreadWithByteBuffer(ByteBuffer.allocate(FILE_SIZE));
+  }
+
+  /**
+   * Test preads with {@link java.nio.DirectByteBuffer}s.
+   */
+  @Test
+  public void testPreadWithDirectByteBuffer() throws IOException {
+    testPreadWithByteBuffer(ByteBuffer.allocateDirect(FILE_SIZE));
+    testPreadWithFullByteBuffer(ByteBuffer.allocateDirect(FILE_SIZE));
+    testPreadWithPositionedByteBuffer(ByteBuffer.allocateDirect(FILE_SIZE));
+    testPreadWithLimitedByteBuffer(ByteBuffer.allocateDirect(FILE_SIZE));
+    testPositionedPreadWithByteBuffer(ByteBuffer.allocateDirect(FILE_SIZE));
+  }
+
+  /**
+   * Reads the entire testFile using the pread API and validates that its
+   * contents are properly loaded into the supplied {@link ByteBuffer}.
+   */
+  private void testPreadWithByteBuffer(ByteBuffer buffer) throws IOException {
+    int bytesRead;
+    int totalBytesRead = 0;
+    try (FSDataInputStream in = fs.open(testFile)) {
+      while ((bytesRead = in.read(totalBytesRead, buffer)) > 0) {
+        totalBytesRead += bytesRead;
+        // Check that each call to read changes the position of the ByteBuffer
+        // correctly
+        assertEquals(totalBytesRead, buffer.position());
+      }
+
+      // Make sure the buffer is full
+      assertFalse(buffer.hasRemaining());
+      // Make sure the contents of the read buffer equal the contents of the
+      // file
+      buffer.position(0);
+      byte[] bufferContents = new byte[FILE_SIZE];
+      buffer.get(bufferContents);
+      assertArrayEquals(bufferContents, fileContents);
+      buffer.position(buffer.limit());
+    }
+  }
+
+  /**
+   * Attempts to read the testFile into a {@link ByteBuffer} that is already
+   * full, and validates that doing so does not change the contents of the
+   * supplied {@link ByteBuffer}.
+   */
+  private void testPreadWithFullByteBuffer(ByteBuffer buffer)
+          throws IOException {
+    // Load some dummy data into the buffer
+    byte[] existingBufferBytes = new byte[FILE_SIZE];
+    rand.nextBytes(existingBufferBytes);
+    buffer.put(existingBufferBytes);
+    // Make sure the buffer is full
+    assertFalse(buffer.hasRemaining());
+
+    try (FSDataInputStream in = fs.open(testFile)) {
+      // Attempt to read into the buffer, 0 bytes should be read since the
+      // buffer is full
+      assertEquals(0, in.read(buffer));
+
+      // Double check the buffer is still full and its contents have not
+      // changed
+      assertFalse(buffer.hasRemaining());
+      buffer.position(0);
+      byte[] bufferContents = new byte[FILE_SIZE];
+      buffer.get(bufferContents);
+      assertArrayEquals(bufferContents, existingBufferBytes);
+    }
+  }
+
+  /**
+   * Reads half of the testFile into the {@link ByteBuffer} by setting a
+   * {@link ByteBuffer#limit} on the buffer. Validates that only half of the
+   * testFile is loaded into the buffer.
+   */
+  private void testPreadWithLimitedByteBuffer(
+          ByteBuffer buffer) throws IOException {
+    int bytesRead;
+    int totalBytesRead = 0;
+    // Set the buffer limit to half the size of the file
+    buffer.limit(FILE_SIZE / 2);
+
+    try (FSDataInputStream in = fs.open(testFile)) {
+      while ((bytesRead = in.read(totalBytesRead, buffer)) > 0) {
+        totalBytesRead += bytesRead;
+        // Check that each call to read changes the position of the ByteBuffer
+        // correctly
+        assertEquals(totalBytesRead, buffer.position());
+      }
+
+      // Since we set the buffer limit to half the size of the file, we should
+      // have only read half of the file into the buffer
+      assertEquals(totalBytesRead, FILE_SIZE / 2);
+      // Check that the buffer is full and the contents equal the first half of
+      // the file
+      assertFalse(buffer.hasRemaining());
+      buffer.position(0);
+      byte[] bufferContents = new byte[FILE_SIZE / 2];
+      buffer.get(bufferContents);
+      assertArrayEquals(bufferContents,
+              Arrays.copyOfRange(fileContents, 0, FILE_SIZE / 2));
+    }
+  }
+
+  /**
+   * Reads half of the testFile into the {@link ByteBuffer} by setting the
+   * {@link ByteBuffer#position} the half the size of the file. Validates that
+   * only half of the testFile is loaded into the buffer.
+   */
+  private void testPreadWithPositionedByteBuffer(
+          ByteBuffer buffer) throws IOException {
+    int bytesRead;
+    int totalBytesRead = 0;
+    // Set the buffer position to half the size of the file
+    buffer.position(FILE_SIZE / 2);
+
+    try (FSDataInputStream in = fs.open(testFile)) {
+      while ((bytesRead = in.read(totalBytesRead, buffer)) > 0) {
+        totalBytesRead += bytesRead;
+        // Check that each call to read changes the position of the ByteBuffer
+        // correctly
+        assertEquals(totalBytesRead + FILE_SIZE / 2, buffer.position());
+      }
+
+      // Since we set the buffer position to half the size of the file, we
+      // should have only read half of the file into the buffer
+      assertEquals(totalBytesRead, FILE_SIZE / 2);
+      // Check that the buffer is full and the contents equal the first half of
+      // the file
+      assertFalse(buffer.hasRemaining());
+      buffer.position(FILE_SIZE / 2);
+      byte[] bufferContents = new byte[FILE_SIZE / 2];
+      buffer.get(bufferContents);
+      assertArrayEquals(bufferContents,
+              Arrays.copyOfRange(fileContents, 0, FILE_SIZE / 2));
+    }
+  }
+
+  /**
+   * Reads half of the testFile into the {@link ByteBuffer} by specifying a
+   * position for the pread API that is half of the file size. Validates that
+   * only half of the testFile is loaded into the buffer.
+   */
+  private void testPositionedPreadWithByteBuffer(
+          ByteBuffer buffer) throws IOException {
+    int bytesRead;
+    int totalBytesRead = 0;
+
+    try (FSDataInputStream in = fs.open(testFile)) {
+      // Start reading from halfway through the file
+      while ((bytesRead = in.read(totalBytesRead + FILE_SIZE / 2,
+              buffer)) > 0) {
+        totalBytesRead += bytesRead;
+        // Check that each call to read changes the position of the ByteBuffer
+        // correctly
+        assertEquals(totalBytesRead, buffer.position());
+      }
+
+      // Since we starting reading halfway through the file, the buffer should
+      // only be half full
+      assertEquals(totalBytesRead, FILE_SIZE / 2);
+      assertEquals(buffer.position(), FILE_SIZE / 2);
+      assertTrue(buffer.hasRemaining());
+      // Check that the buffer contents equal the second half of the file
+      buffer.position(0);
+      byte[] bufferContents = new byte[FILE_SIZE / 2];
+      buffer.get(bufferContents);
+      assertArrayEquals(bufferContents,
+              Arrays.copyOfRange(fileContents, FILE_SIZE / 2, FILE_SIZE));
+    }
+  }
+
+  @AfterClass
+  public static void shutdown() throws IOException {
+    try {
+      fs.delete(testFile, false);
+      fs.close();
+    } finally {
+      cluster.shutdown(true);
+    }
+  }
+}