mirror of https://github.com/apache/lucene.git
new IndexWriter.addIndexesNoOptimize(): LUCENE-528
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@468177 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 19bf841c27
commit 4dfdd6a0af
@@ -39,6 +39,10 @@ New features
 5. LUCENE-544: Added the ability to specify different boosts for different
    fields when using MultiFieldQueryParser (Matt Ericson via Otis Gospodnetic)
 
+6. LUCENE-528: New IndexWriter.addIndexesNoOptimize() that doesn't optimize the
+   index when adding new segments, only performing merges as needed.
+   (Ning Li via Yonik Seeley)
+
 API Changes
 
 1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow
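For context, the new method is invoked like the existing addIndexes(Directory[]). A minimal usage sketch; the paths, analyzer, and surrounding setup are illustrative assumptions, not part of this commit:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddIndexesNoOptimizeDemo {
  public static void main(String[] args) throws Exception {
    // Target index and two source indexes; paths are hypothetical.
    Directory target = FSDirectory.getDirectory("/tmp/target", false);
    Directory[] sources = {
      FSDirectory.getDirectory("/tmp/src1", false),
      FSDirectory.getDirectory("/tmp/src2", false)
    };

    IndexWriter writer = new IndexWriter(target, new StandardAnalyzer(), false);
    // Unlike addIndexes(Directory[]), this does not call optimize();
    // it merges only where the merge policy invariants require it.
    writer.addIndexesNoOptimize(sources);
    writer.close();
  }
}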
@@ -632,6 +632,130 @@ public class IndexWriter {
     optimize(); // final cleanup
   }
 
+  /**
+   * Merges all segments from an array of indexes into this index.
+   * <p>
+   * This is similar to addIndexes(Directory[]). However, no optimize()
+   * is called either at the beginning or at the end. Instead, merges
+   * are carried out as necessary.
+   * <p>
+   * This requires this index not be among those to be added, and the
+   * upper bound* of those segment doc counts not exceed maxMergeDocs.
+   */
+  public synchronized void addIndexesNoOptimize(Directory[] dirs)
+      throws IOException {
+    // Adding indexes can be viewed as adding a sequence of segments S to
+    // a sequence of segments T. Segments in T follow the invariants but
+    // segments in S may not since they could come from multiple indexes.
+    // Here is the merge algorithm for addIndexesNoOptimize():
+    //
+    // 1 Flush ram segments.
+    // 2 Consider a combined sequence with segments from T followed
+    //   by segments from S (same as current addIndexes(Directory[])).
+    // 3 Assume the highest level for segments in S is h. Call
+    //   maybeMergeSegments(), but instead of starting w/ lowerBound = -1
+    //   and upperBound = maxBufferedDocs, start w/ lowerBound = -1 and
+    //   upperBound = upperBound of level h. After this, the invariants
+    //   are guaranteed except for the last < M segments whose levels <= h.
+    // 4 If the invariants hold for the last < M segments whose levels <= h,
+    //   if some of those < M segments are from S (not merged in step 3),
+    //   properly copy them over*, otherwise done.
+    //   Otherwise, simply merge those segments. If the merge results in
+    //   a segment of level <= h, done. Otherwise, it's of level h+1 and call
+    //   maybeMergeSegments() starting w/ upperBound = upperBound of level h+1.
+    //
+    // * Ideally, we want to simply copy a segment. However, directory does
+    //   not support copy yet. In addition, source may use compound file or not
+    //   and target may use compound file or not. So we use mergeSegments() to
+    //   copy a segment, which may cause doc count to change because deleted
+    //   docs are garbage collected.
+    //
+    // In current addIndexes(Directory[]), segment infos in S are added to
+    // T's "segmentInfos" upfront. Then segments in S are merged to T several
+    // at a time. Every merge is committed with T's "segmentInfos". So if
+    // a reader is opened on T while addIndexes() is going on, it could see
+    // an inconsistent index. AddIndexesNoOptimize() has a similar behaviour.
+
+    // 1 flush ram segments
+    flushRamSegments();
+
+    // 2 copy segment infos and find the highest level from dirs
+    int start = segmentInfos.size();
+    int startUpperBound = minMergeDocs;
+
+    try {
+      for (int i = 0; i < dirs.length; i++) {
+        if (directory == dirs[i]) {
+          // cannot add this index: segments may be deleted in merge before added
+          throw new IllegalArgumentException("Cannot add this index to itself");
+        }
+
+        SegmentInfos sis = new SegmentInfos(); // read infos from dir
+        sis.read(dirs[i]);
+        for (int j = 0; j < sis.size(); j++) {
+          SegmentInfo info = sis.info(j);
+          segmentInfos.addElement(info); // add each info
+
+          while (startUpperBound < info.docCount) {
+            startUpperBound *= mergeFactor; // find the highest level from dirs
+            if (startUpperBound > maxMergeDocs) {
+              // upper bound cannot exceed maxMergeDocs
+              throw new IllegalArgumentException("Upper bound cannot exceed maxMergeDocs");
+            }
+          }
+        }
+      }
+    } catch (IllegalArgumentException e) {
+      for (int i = segmentInfos.size() - 1; i >= start; i--) {
+        segmentInfos.remove(i);
+      }
+      throw e;
+    }
+
+    // 3 maybe merge segments starting from the highest level from dirs
+    maybeMergeSegments(startUpperBound);
+
+    // get the tail segments whose levels <= h
+    int segmentCount = segmentInfos.size();
+    int numTailSegments = 0;
+    while (numTailSegments < segmentCount
+        && startUpperBound >= segmentInfos.info(segmentCount - 1 - numTailSegments).docCount) {
+      numTailSegments++;
+    }
+    if (numTailSegments == 0) {
+      return;
+    }
+
+    // 4 make sure invariants hold for the tail segments whose levels <= h
+    if (checkNonDecreasingLevels(segmentCount - numTailSegments)) {
+      // identify the segments from S to be copied (not merged in 3)
+      int numSegmentsToCopy = 0;
+      while (numSegmentsToCopy < segmentCount
+          && directory != segmentInfos.info(segmentCount - 1 - numSegmentsToCopy).dir) {
+        numSegmentsToCopy++;
+      }
+      if (numSegmentsToCopy == 0) {
+        return;
+      }
+
+      // copy those segments from S
+      for (int i = segmentCount - numSegmentsToCopy; i < segmentCount; i++) {
+        mergeSegments(segmentInfos, i, i + 1);
+      }
+      if (checkNonDecreasingLevels(segmentCount - numSegmentsToCopy)) {
+        return;
+      }
+    }
+
+    // invariants do not hold, simply merge those segments
+    mergeSegments(segmentInfos, segmentCount - numTailSegments, segmentCount);
+
+    // maybe merge segments again if necessary
+    if (segmentInfos.info(segmentInfos.size() - 1).docCount > startUpperBound) {
+      maybeMergeSegments(startUpperBound * mergeFactor);
+    }
+  }
+
   /** Merges the provided indexes into this index.
    * <p>After this completes, the index is optimized. </p>
    * <p>The provided IndexReaders are not closed.</p>
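The "level" bookkeeping in the comments above is just repeated multiplication by mergeFactor starting from minMergeDocs: level h has upper bound minMergeDocs * mergeFactor^h. A standalone sketch of that arithmetic (class and method names here are assumed for illustration, not from the commit):

public class MergeLevelDemo {
  // A segment whose docCount falls in (upperBound(h-1), upperBound(h)]
  // sits at level h, where upperBound(h) = minMergeDocs * mergeFactor^h.
  static int levelOf(int docCount, int minMergeDocs, int mergeFactor) {
    int upperBound = minMergeDocs;
    int level = 0;
    while (docCount > upperBound) {
      upperBound *= mergeFactor;
      level++;
    }
    return level;
  }

  public static void main(String[] args) {
    // With minMergeDocs = 10 and mergeFactor = 10:
    System.out.println(levelOf(7, 10, 10));   // 0  (1..10 docs)
    System.out.println(levelOf(55, 10, 10));  // 1  (11..100 docs)
    System.out.println(levelOf(999, 10, 10)); // 2  (101..1000 docs)
  }
}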
@@ -735,16 +859,16 @@ public class IndexWriter {
   private final void flushRamSegments() throws IOException {
     if (ramSegmentInfos.size() > 0) {
       mergeSegments(ramSegmentInfos, 0, ramSegmentInfos.size());
-      maybeMergeSegments();
+      maybeMergeSegments(minMergeDocs);
     }
   }
 
   /** Incremental segment merger. */
-  private final void maybeMergeSegments() throws IOException {
+  private final void maybeMergeSegments(int startUpperBound) throws IOException {
     long lowerBound = -1;
-    long upperBound = minMergeDocs;
+    long upperBound = startUpperBound;
 
-    while (upperBound * mergeFactor <= maxMergeDocs) {
+    while (upperBound < maxMergeDocs) {
       int minSegment = segmentInfos.size();
       int maxSegment = -1;
 
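The reworked loop condition is easiest to read by enumerating the level upper bounds it visits. A sketch under assumed settings (these values are illustrative, not from the commit):

// Assumed: startUpperBound = minMergeDocs = 10, mergeFactor = 10,
// maxMergeDocs = 100000. The new condition visits upper bounds
// 10, 100, 1000, 10000 and then stops.
int upperBound = 10; // the startUpperBound argument
final int mergeFactor = 10;
final int maxMergeDocs = 100000;
while (upperBound < maxMergeDocs) {
  System.out.println("considering merges up to " + upperBound + " docs");
  upperBound *= mergeFactor;
}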
@@ -949,4 +1073,22 @@ public class IndexWriter {
     }
     directory.renameFile("deleteable.new", IndexFileNames.DELETABLE);
   }
+
+  private final boolean checkNonDecreasingLevels(int start) {
+    int lowerBound = -1;
+    int upperBound = minMergeDocs;
+
+    for (int i = segmentInfos.size() - 1; i >= start; i--) {
+      int docCount = segmentInfos.info(i).docCount;
+      if (docCount <= lowerBound) {
+        return false;
+      }
+
+      while (docCount > upperBound) {
+        lowerBound = upperBound;
+        upperBound *= mergeFactor;
+      }
+    }
+    return true;
+  }
 }
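To see what this invariant accepts and rejects, here is a standalone restatement of the same logic over a plain array of doc counts; the class and method names are assumptions for illustration:

public class LevelInvariantDemo {
  // Mirror of checkNonDecreasingLevels(): scanning from the newest segment
  // backwards, a doc count must stay above the lower bound of the highest
  // level reached so far, i.e. levels never decrease toward older segments.
  static boolean nonDecreasingLevels(int[] docCounts, int minMergeDocs, int mergeFactor) {
    int lowerBound = -1;
    int upperBound = minMergeDocs;
    for (int i = docCounts.length - 1; i >= 0; i--) {
      int docCount = docCounts[i];
      if (docCount <= lowerBound) {
        return false;
      }
      while (docCount > upperBound) {
        lowerBound = upperBound;
        upperBound *= mergeFactor;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    // Arrays run oldest ... newest, with minMergeDocs = 10, mergeFactor = 10.
    System.out.println(nonDecreasingLevels(new int[] {1000, 100, 10}, 10, 10)); // true
    System.out.println(nonDecreasingLevels(new int[] {10, 1000}, 10, 10));      // false
  }
}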
@@ -181,14 +181,14 @@ public class TestIndexWriterMergePolicy extends TestCase {
     int ramSegmentCount = writer.getRAMSegmentCount();
     assertTrue(ramSegmentCount < maxBufferedDocs);
 
-    int lowerBound = 0;
+    int lowerBound = -1;
     int upperBound = maxBufferedDocs;
     int numSegments = 0;
 
     int segmentCount = writer.getSegmentCount();
     for (int i = segmentCount - 1; i >= 0; i--) {
       int docCount = writer.getDocCount(i);
-      assertTrue(docCount > lowerBound || docCount == 0);
+      assertTrue(docCount > lowerBound);
 
       if (docCount <= upperBound) {
         numSegments++;
@@ -197,8 +197,10 @@ public class TestIndexWriterMergePolicy extends TestCase {
         assertTrue(numSegments < mergeFactor);
       }
 
-      lowerBound = upperBound;
-      upperBound *= mergeFactor;
+      do {
+        lowerBound = upperBound;
+        upperBound *= mergeFactor;
+      } while (docCount > upperBound);
       numSegments = 1;
     }
   }
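The do/while rewrite in the test matters when a segment's level jumps by more than one, which can now happen once addIndexesNoOptimize() brings in segments from other indexes. A small worked sketch with assumed values:

// Assumed: maxBufferedDocs = 10, mergeFactor = 10, and the test's scan
// reaches a segment with docCount = 5000 while upperBound is still 10.
// The old two-line version advanced exactly one level (10 -> 100), so a
// level-skipping segment left the bounds wrong for later iterations;
// the do/while keeps multiplying until docCount <= upperBound.
int lowerBound = -1;
int upperBound = 10;
final int mergeFactor = 10;
int docCount = 5000;
do {
  lowerBound = upperBound;
  upperBound *= mergeFactor;
} while (docCount > upperBound);
// now lowerBound == 1000 and upperBound == 10000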