mirror of https://github.com/apache/lucene.git
LUCENE-2537: merging from 3x
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@979185 13f79535-47bb-0310-9956-ffa450edef68
commit 42634c54f7
parent 6487d5caf2
lucene/CHANGES.txt
@@ -447,6 +447,9 @@ Bug fixes
   explicitly enable the old behavior with setAutoGeneratePhraseQueries(true)
   (Robert Muir)
 
+* LUCENE-2537: FSDirectory.copy() implementation was unsafe and could result in
+  OOM if a large file was copied. (Shai Erera)
+
 New features
 
 * LUCENE-2128: Parallelized fetching document frequencies during weight
lucene/src/java/org/apache/lucene/index/CompoundFileWriter.java
@@ -20,12 +20,13 @@ package org.apache.lucene.index;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.IOUtils;
 
 import java.util.LinkedList;
 import java.util.HashSet;
 
 import java.io.IOException;
 
 /**
  * Combines multiple files into a single compound file.
  * The file format:<br>
@@ -142,20 +143,17 @@ final class CompoundFileWriter {
    */
   public void close() throws IOException {
     if (merged)
-      throw new IllegalStateException(
-        "Merge already performed");
+      throw new IllegalStateException("Merge already performed");
 
     if (entries.isEmpty())
-      throw new IllegalStateException(
-        "No entries to merge have been defined");
+      throw new IllegalStateException("No entries to merge have been defined");
 
     merged = true;
 
     // open the compound stream
-    IndexOutput os = null;
+    IndexOutput os = directory.createOutput(fileName);
+    IOException priorException = null;
     try {
-      os = directory.createOutput(fileName);
-
       // Write the Version info - must be a VInt because CFR reads a VInt
       //  in older versions!
       os.writeVInt(FORMAT_CURRENT);
@@ -185,10 +183,9 @@ final class CompoundFileWriter {
 
     // Open the files and copy their data into the stream.
     // Remember the locations of each file's data section.
-    byte buffer[] = new byte[16384];
     for (FileEntry fe : entries) {
       fe.dataOffset = os.getFilePointer();
-      copyFile(fe, os, buffer);
+      copyFile(fe, os);
     }
 
     // Write the data offsets into the directory of the compound stream
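The explicit 16 KB staging buffer disappears here because the copy now goes through IndexOutput.copyBytes (next hunk), letting each directory implementation choose the most efficient transfer. A minimal standalone sketch of such a chunked copy helper, using plain java.io streams rather than Lucene's IndexInput/IndexOutput (the class name and constant below are illustrative only):

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

final class CopyBytesSketch {
  private static final int BUFFER_SIZE = 16384; // same size the removed loop used

  /** Copies exactly numBytes from in to out; fails if the input ends early. */
  static void copyBytes(InputStream in, OutputStream out, long numBytes) throws IOException {
    byte[] buffer = new byte[BUFFER_SIZE];
    long remaining = numBytes;
    while (remaining > 0) {
      int len = (int) Math.min(buffer.length, remaining);
      int read = in.read(buffer, 0, len);
      if (read == -1) {
        throw new IOException("Unexpected end of input, " + remaining + " bytes remaining");
      }
      out.write(buffer, 0, read);
      remaining -= read;
    }
  }
}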
@@ -206,56 +203,37 @@ final class CompoundFileWriter {
       IndexOutput tmp = os;
       os = null;
       tmp.close();
-
+    } catch (IOException e) {
+      priorException = e;
     } finally {
-      if (os != null) try { os.close(); } catch (IOException e) { }
+      IOUtils.closeSafely(priorException, os);
     }
   }
 
-  /** Copy the contents of the file with specified extension into the
-   *  provided output stream. Use the provided buffer for moving data
-   *  to reduce memory allocation.
-   */
-  private void copyFile(FileEntry source, IndexOutput os, byte buffer[])
-          throws IOException
-  {
-    IndexInput is = null;
-    try {
-      long startPtr = os.getFilePointer();
+  /**
+   * Copy the contents of the file with specified extension into the provided
+   * output stream.
+   */
+  private void copyFile(FileEntry source, IndexOutput os) throws IOException {
+    IndexInput is = directory.openInput(source.file);
+    try {
+      long startPtr = os.getFilePointer();
+      long length = is.length();
+      os.copyBytes(is, length);
 
-      is = directory.openInput(source.file);
-      long length = is.length();
-      long remainder = length;
-      int chunk = buffer.length;
+      if (checkAbort != null) {
+        checkAbort.work(length);
+      }
 
-      while(remainder > 0) {
-        int len = (int) Math.min(chunk, remainder);
-        is.readBytes(buffer, 0, len, false);
-        os.writeBytes(buffer, len);
-        remainder -= len;
-        if (checkAbort != null)
-          // Roughly every 2 MB we will check if
-          //  it's time to abort
-          checkAbort.work(80);
-      }
+      // Verify that the output length diff is equal to original file
+      long endPtr = os.getFilePointer();
+      long diff = endPtr - startPtr;
+      if (diff != length)
+        throw new IOException("Difference in the output file offsets " + diff
+            + " does not match the original file length " + length);
 
-      // Verify that remainder is 0
-      if (remainder != 0)
-        throw new IOException(
-          "Non-zero remainder length after copying: " + remainder
-          + " (id: " + source.file + ", length: " + length
-          + ", buffer size: " + chunk + ")");
-
-      // Verify that the output length diff is equal to original file
-      long endPtr = os.getFilePointer();
-      long diff = endPtr - startPtr;
-      if (diff != length)
-        throw new IOException(
-          "Difference in the output file offsets " + diff
-          + " does not match the original file length " + length);
-
     } finally {
-      if (is != null) is.close();
+      is.close();
     }
   }
 }
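The rewritten close() records the first IOException and still closes the output in the finally block; IOUtils.closeSafely rethrows the prior exception in preference to any failure thrown while closing, so the original error is never masked. A minimal sketch of that pattern in plain Java (the helper below is illustrative, not Lucene's actual IOUtils):

import java.io.Closeable;
import java.io.IOException;

final class CloseSafelySketch {
  /**
   * Closes each object; if priorException is non-null it is rethrown and any
   * exception thrown while closing is suppressed, so the original error wins.
   */
  static void closeSafely(IOException priorException, Closeable... objects) throws IOException {
    IOException firstCloseFailure = null;
    for (Closeable object : objects) {
      try {
        if (object != null) {
          object.close();
        }
      } catch (IOException e) {
        if (firstCloseFailure == null) {
          firstCloseFailure = e;
        }
      }
    }
    if (priorException != null) {
      throw priorException; // the original failure takes precedence
    }
    if (firstCloseFailure != null) {
      throw firstCloseFailure;
    }
  }
}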
lucene/src/java/org/apache/lucene/store/FSDirectory.java
@@ -33,6 +33,8 @@ import java.util.Collections;
 import static java.util.Collections.synchronizedSet;
 import java.util.HashSet;
 import java.util.Set;
 
+import org.apache.lucene.store.SimpleFSDirectory.SimpleFSIndexInput;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.ThreadInterruptedException;
 import org.apache.lucene.util.Constants;
@@ -126,6 +128,12 @@ public abstract class FSDirectory extends Directory {
   protected final Set<String> staleFiles = synchronizedSet(new HashSet<String>()); // Files written, but not yet sync'ed
   private int chunkSize = DEFAULT_READ_CHUNK_SIZE; // LUCENE-1566
 
+  /**
+   * Chunk size used to read when using FileChannel API. If an attempt to read a
+   * large file is made without limiting the chunk size, an OOM may occur.
+   */
+  private static final long CHANNEL_CHUNK_SIZE = 1 << 21; // Use 2MB chunk size - LUCENE-2537
+
   // returns the canonical version of the directory, creating it if it doesn't exist.
   private static File getCanonicalPath(File file) throws IOException {
     return new File(file.getCanonicalPath());
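For the record, the shifted constant is exactly the 2 MB the comment claims; a one-line check in plain Java (no new API, just arithmetic):

public class ChunkSizeCheck {
  public static void main(String[] args) {
    // 1 << 21 is 2^21 = 2,097,152 bytes, i.e. the 2 MB chunk the javadoc describes
    System.out.println((1L << 21) == 2L * 1024 * 1024); // prints: true
  }
}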
@@ -441,7 +449,7 @@ public abstract class FSDirectory extends Directory {
     try {
       input = new FileInputStream(new File(directory, src)).getChannel();
       output = new FileOutputStream(new File(target.directory, dest)).getChannel();
-      output.transferFrom(input, 0, input.size());
+      copy(input, output, input.size());
     } catch (IOException ioe) {
       priorException = ioe;
     } finally {
@@ -451,6 +459,25 @@ public abstract class FSDirectory extends Directory {
       super.copy(to, src, dest);
     }
   }
 
+  /**
+   * Copies the content of a given {@link FileChannel} to a destination one. The
+   * copy is done in chunks of 2MB because if transferFrom is used without a
+   * limit when copying a very large file, then an OOM may be thrown (depends on
+   * the state of the RAM in the machine, as well as the OS used). Performance
+   * measurements showed that chunk sizes larger than 2MB do not result in much
+   * faster file copy, therefore we limit the size to be safe with different
+   * file sizes and systems.
+   */
+  static void copy(FileChannel input, FileChannel output, long numBytes) throws IOException {
+    long pos = output.position();
+    long writeTo = numBytes + pos;
+    while (pos < writeTo) {
+      pos += output.transferFrom(input, pos, Math.min(CHANNEL_CHUNK_SIZE, writeTo - pos));
+    }
+    // transferFrom does not change the position of the channel. Need to change it manually
+    output.position(pos);
+  }
+
   protected static class FSIndexOutput extends BufferedIndexOutput {
     private final FSDirectory parent;
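The chunked copy can be exercised with nothing but the JDK; this sketch repeats the patch's algorithm (the 2 MB constant and the loop mirror FSDirectory.copy above, while the main scaffolding is illustrative only):

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.channels.FileChannel;

public class ChunkedCopyDemo {
  private static final long CHANNEL_CHUNK_SIZE = 1 << 21; // 2 MB, as in the patch

  // Same algorithm as FSDirectory.copy: bounded transferFrom calls, so the OS
  // never has to service one huge transfer at once.
  static void copy(FileChannel input, FileChannel output, long numBytes) throws IOException {
    long pos = output.position();
    long writeTo = numBytes + pos;
    while (pos < writeTo) {
      pos += output.transferFrom(input, pos, Math.min(CHANNEL_CHUNK_SIZE, writeTo - pos));
    }
    output.position(pos); // transferFrom does not advance the output position
  }

  public static void main(String[] args) throws IOException {
    File src = new File(args[0]);
    File dest = new File(args[1]);
    FileChannel in = new FileInputStream(src).getChannel();
    FileChannel out = new FileOutputStream(dest).getChannel();
    try {
      copy(in, out, in.size());
    } finally {
      in.close();
      out.close();
    }
    System.out.println("copied " + src.length() + " -> " + dest.length() + " bytes");
  }
}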
@@ -471,6 +498,28 @@ public abstract class FSDirectory extends Directory {
       file.write(b, offset, size);
     }
 
+    @Override
+    public void copyBytes(DataInput input, long numBytes) throws IOException {
+      // Optimized copy only if the number of bytes to copy is larger than the
+      // buffer size, and the given IndexInput supports FileChannel copying.
+      // NOTE: the below check relies on NIOFSIndexInput extending SimpleFSIndexInput.
+      // If that changes in the future, we should change the check as well.
+      if (numBytes > BUFFER_SIZE && input instanceof SimpleFSIndexInput) {
+        // flush any bytes in the buffer
+        flush();
+        // do the optimized copy
+        FileChannel in = ((SimpleFSIndexInput) input).file.getChannel();
+        FileChannel out = file.getChannel();
+        copy(in, out, numBytes);
+        // correct the position in super (BufferedIndexOutput), so that calls
+        // to getFilePointer will return the correct pointer.
+        // Perhaps a specific method is better?
+        super.seek(out.position());
+      } else {
+        super.copyBytes(input, numBytes);
+      }
+    }
+
     @Override
     public void close() throws IOException {
       // only close the file if it has not been closed yet
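The override is safe only because flush() empties BufferedIndexOutput's buffer before the channel transfer begins, and super.seek() re-syncs the buffered position afterwards. The same flush-then-bypass pattern with plain JDK types (the helper and its names are illustrative, not Lucene API):

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.channels.FileChannel;

final class BypassBufferSketch {
  /**
   * Appends numBytes from src to a buffered output whose underlying stream is
   * a FileOutputStream: flush the buffer first so the channel transfer lands
   * after everything already written, then copy through the channel.
   */
  static void appendViaChannel(BufferedOutputStream out, FileOutputStream base,
                               FileChannel src, long numBytes) throws IOException {
    out.flush(); // push buffered bytes down before bypassing the buffer
    FileChannel dest = base.getChannel();
    long pos = dest.position();
    long writeTo = pos + numBytes;
    while (pos < writeTo) {
      pos += dest.transferFrom(src, pos, Math.min(1 << 21, writeTo - pos));
    }
    dest.position(pos); // keep the channel position consistent for later writes
  }
}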
lucene/src/java/org/apache/lucene/store/SimpleFSDirectory.java
@@ -125,7 +125,7 @@ public class SimpleFSDirectory extends FSDirectory {
       final OutOfMemoryError outOfMemoryError = new OutOfMemoryError(
           "OutOfMemoryError likely caused by the Sun VM Bug described in "
           + "https://issues.apache.org/jira/browse/LUCENE-1566; try calling FSDirectory.setReadChunkSize "
-          + "with a a value smaller than the current chunks size (" + chunkSize + ")");
+          + "with a value smaller than the current chunks size (" + chunkSize + ")");
       outOfMemoryError.initCause(e);
       throw outOfMemoryError;
     }