LUCENE-1120: speed up merging of term vectors by bulk-copying the raw bytes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@615183 13f79535-47bb-0310-9956-ffa450edef68
2008-01-25 11:32:32 +00:00 · 2008-01-25 11:32:32 +00:00 · 76dfb92a44
parent ee835ccf21
commit 76dfb92a44
19 changed files with 331 additions and 148 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -1,4 +1,4 @@
-Lucene Change Log
+Lucene Change Log
 $Id$

 ======================= Trunk (not yet released) =======================
@ -32,6 +32,10 @@ Optimizations
    disk full situation before actually filling up the disk.  (Mike
    McCandless)

+ 2. LUCENE-1120: Speed up merging of term vectors by bulk-copying the
+    raw bytes for each contiguous range of non-deleted documents.
+    (Mike McCandless)
+
 Documentation

 Build
--- a/src/java/org/apache/lucene/index/DocumentsWriter.java
+++ b/src/java/org/apache/lucene/index/DocumentsWriter.java
@ -662,12 +662,12 @@ final class DocumentsWriter {
        // Append term vectors to the real outputs:
        if (tvx != null) {
          tvx.writeLong(tvd.getFilePointer());
+          tvx.writeLong(tvf.getFilePointer());
          tvd.writeVInt(numVectorFields);
          if (numVectorFields > 0) {
            for(int i=0;i<numVectorFields;i++)
              tvd.writeVInt(vectorFieldNumbers[i]);
            assert 0 == vectorFieldPointers[0];
-            tvd.writeVLong(tvf.getFilePointer());
            long lastPos = vectorFieldPointers[0];
            for(int i=1;i<numVectorFields;i++) {
              long pos = vectorFieldPointers[i];
@ -870,22 +870,23 @@ final class DocumentsWriter {
          // state:
          try {
            tvx = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
-            tvx.writeInt(TermVectorsReader.FORMAT_VERSION);
+            tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
            tvd = directory.createOutput(docStoreSegment +  "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
-            tvd.writeInt(TermVectorsReader.FORMAT_VERSION);
+            tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
            tvf = directory.createOutput(docStoreSegment +  "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
-            tvf.writeInt(TermVectorsReader.FORMAT_VERSION);
+            tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);

            // We must "catch up" for all docIDs that had no
            // vectors before this one
-            for(int i=0;i<docID;i++)
+            for(int i=0;i<docID;i++) {
              tvx.writeLong(0);
+              tvx.writeLong(0);
+            }
          } catch (Throwable t) {
            throw new AbortException(t, DocumentsWriter.this);
          }
          files = null;
        }
-
        numVectorFields = 0;
      }
    }
--- a/src/java/org/apache/lucene/index/SegmentMerger.java
+++ b/src/java/org/apache/lucene/index/SegmentMerger.java
@ -205,6 +205,38 @@ final class SegmentMerger {
    }
  }

+  private SegmentReader[] matchingSegmentReaders;
+  private int[] rawDocLengths;
+  private int[] rawDocLengths2;
+
+  private void setMatchingSegmentReaders() {
+    // If the i'th reader is a SegmentReader and has
+    // identical fieldName -> number mapping, then this
+    // array will be non-null at position i:
+    matchingSegmentReaders = new SegmentReader[readers.size()];
+
+    // If this reader is a SegmentReader, and all of its
+    // field name -> number mappings match the "merged"
+    // FieldInfos, then we can do a bulk copy of the
+    // stored fields:
+    for (int i = 0; i < readers.size(); i++) {
+      IndexReader reader = (IndexReader) readers.elementAt(i);
+      if (reader instanceof SegmentReader) {
+        SegmentReader segmentReader = (SegmentReader) reader;
+        boolean same = true;
+        FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
+        for (int j = 0; same && j < segmentFieldInfos.size(); j++)
+          same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
+        if (same)
+          matchingSegmentReaders[i] = segmentReader;
+      }
+    }
+
+    // Used for bulk-reading raw bytes for stored fields
+    rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
+    rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS];
+  }
+
  /**
   * 
   * @return The number of documents in all of the readers
@ -248,34 +280,10 @@ final class SegmentMerger {

    int docCount = 0;

+    setMatchingSegmentReaders();
+
    if (mergeDocStores) {

-      // If the i'th reader is a SegmentReader and has
-      // identical fieldName -> number mapping, then this
-      // array will be non-null at position i:
-      SegmentReader[] matchingSegmentReaders = new SegmentReader[readers.size()];
-
-      // If this reader is a SegmentReader, and all of its
-      // field name -> number mappings match the "merged"
-      // FieldInfos, then we can do a bulk copy of the
-      // stored fields:
-      for (int i = 0; i < readers.size(); i++) {
-        IndexReader reader = (IndexReader) readers.elementAt(i);
-        if (reader instanceof SegmentReader) {
-          SegmentReader segmentReader = (SegmentReader) reader;
-          boolean same = true;
-          FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
-          for (int j = 0; same && j < segmentFieldInfos.size(); j++)
-            same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
-          if (same) {
-            matchingSegmentReaders[i] = segmentReader;
-          }
-        }
-      }
-	
-      // Used for bulk-reading raw bytes for stored fields
-      final int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
-
      // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
      // in  merge mode, we use this FieldSelector
      FieldSelector fieldSelectorMerge = new FieldSelector() {
@ -350,15 +358,45 @@ final class SegmentMerger {

    try {
      for (int r = 0; r < readers.size(); r++) {
+        final SegmentReader matchingSegmentReader = matchingSegmentReaders[r];
+        TermVectorsReader matchingVectorsReader;
+        if (matchingSegmentReader != null) {
+          matchingVectorsReader = matchingSegmentReader.termVectorsReaderOrig;
+
+          // If the TV* files are an older format then they
+          // cannot read raw docs:
+          if (matchingVectorsReader != null && !matchingVectorsReader.canReadRawDocs())
+            matchingVectorsReader = null;
+        } else
+          matchingVectorsReader = null;
        IndexReader reader = (IndexReader) readers.elementAt(r);
        int maxDoc = reader.maxDoc();
-        for (int docNum = 0; docNum < maxDoc; docNum++) {
+        for (int docNum = 0; docNum < maxDoc;) {
          // skip deleted docs
-          if (reader.isDeleted(docNum)) 
-            continue;
-          termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
-          if (checkAbort != null)
-            checkAbort.work(300);
+          if (!reader.isDeleted(docNum)) {
+            if (matchingVectorsReader != null) {
+              // We can optimize this case (doing a bulk
+              // byte copy) since the field numbers are
+              // identical
+              int start = docNum;
+              int numDocs = 0;
+              do {
+                docNum++;
+                numDocs++;
+              } while(docNum < maxDoc && !matchingSegmentReader.isDeleted(docNum) && numDocs < MAX_RAW_MERGE_DOCS);
+
+              matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
+              termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
+              if (checkAbort != null)
+                checkAbort.work(300*numDocs);
+            } else {
+              termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
+              docNum++;
+              if (checkAbort != null)
+                checkAbort.work(300);
+            }
+          } else
+            docNum++;
        }
      }
    } finally {
--- a/src/java/org/apache/lucene/index/TermVectorsReader.java
+++ b/src/java/org/apache/lucene/index/TermVectorsReader.java
@ -22,13 +22,18 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;

 import java.io.IOException;
+import java.util.Arrays;

 /**
 * @version $Id$
 */
 class TermVectorsReader implements Cloneable {

+  // NOTE: if you make a new format, it must be larger than
+  // the current format
  static final int FORMAT_VERSION = 2;
+  static final int FORMAT_VERSION2 = 3;
+
  //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file 
  static final int FORMAT_SIZE = 4;

@ -41,13 +46,13 @@ class TermVectorsReader implements Cloneable {
  private IndexInput tvd;
  private IndexInput tvf;
  private int size;
+  private int numTotalDocs;

  // The docID offset where our docs begin in the index
  // file.  This will be 0 if we have our own private file.
  private int docStoreOffset;
  
-  private int tvdFormat;
-  private int tvfFormat;
+  private final int format;

  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
    throws CorruptIndexException, IOException {
@ -56,7 +61,7 @@ class TermVectorsReader implements Cloneable {

  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
    throws CorruptIndexException, IOException {
-    this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE, -1, 0);
+    this(d, segment, fieldInfos, readBufferSize, -1, 0);
  }
    
  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
@ -66,22 +71,35 @@ class TermVectorsReader implements Cloneable {
    try {
      if (d.fileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION)) {
        tvx = d.openInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize);
-        checkValidFormat(tvx);
+        format = checkValidFormat(tvx);
        tvd = d.openInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
-        tvdFormat = checkValidFormat(tvd);
+        final int tvdFormat = checkValidFormat(tvd);
        tvf = d.openInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
-        tvfFormat = checkValidFormat(tvf);
+        final int tvfFormat = checkValidFormat(tvf);
+
+        assert format == tvdFormat;
+        assert format == tvfFormat;
+
+        if (format >= FORMAT_VERSION2) {
+          assert (tvx.length()-FORMAT_SIZE) % 16 == 0;
+          numTotalDocs = (int) (tvx.length() >> 4);
+        } else {
+          assert (tvx.length()-FORMAT_SIZE) % 8 == 0;
+          numTotalDocs = (int) (tvx.length() >> 3);
+        }
+
        if (-1 == docStoreOffset) {
          this.docStoreOffset = 0;
-          this.size = (int) (tvx.length() >> 3);
+          this.size = numTotalDocs;
        } else {
          this.docStoreOffset = docStoreOffset;
          this.size = size;
          // Verify the file is long enough to hold all of our
          // docs
-          assert ((int) (tvx.length() / 8)) >= size + docStoreOffset;
+          assert numTotalDocs >= size + docStoreOffset;
        }
-      }
+      } else
+        format = 0;

      this.fieldInfos = fieldInfos;
      success = true;
@ -96,26 +114,94 @@ class TermVectorsReader implements Cloneable {
      }
    }
  }
-  
+
+  // Used for bulk copy when merging
+  IndexInput getTvdStream() {
+    return tvd;
+  }
+
+  // Used for bulk copy when merging
+  IndexInput getTvfStream() {
+    return tvf;
+  }
+
+  final private void seekTvx(final int docNum) throws IOException {
+    if (format < FORMAT_VERSION2)
+      tvx.seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
+    else
+      tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
+  }
+
+  boolean canReadRawDocs() {
+    return format >= FORMAT_VERSION2;
+  }
+
+  /** Retrieve the length (in bytes) of the tvd and tvf
+   *  entries for the next numDocs starting with
+   *  startDocID.  This is used for bulk copying when
+   *  merging segments, if the field numbers are
+   *  congruent.  Once this returns, the tvf & tvd streams
+   *  are seeked to the startDocID. */
+  final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {
+
+    if (tvx == null) {
+      Arrays.fill(tvdLengths, 0);
+      Arrays.fill(tvfLengths, 0);
+      return;
+    }
+
+    // SegmentMerger calls canReadRawDocs() first and should
+    // not call us if that returns false.
+    if (format < FORMAT_VERSION2)
+      throw new IllegalStateException("cannot read raw docs with older term vector formats");
+
+    seekTvx(startDocID);
+
+    long tvdPosition = tvx.readLong();
+    tvd.seek(tvdPosition);
+
+    long tvfPosition = tvx.readLong();
+    tvf.seek(tvfPosition);
+
+    long lastTvdPosition = tvdPosition;
+    long lastTvfPosition = tvfPosition;
+
+    int count = 0;
+    while (count < numDocs) {
+      final int docID = startDocID + count + 1;
+      if (docID < numTotalDocs)  {
+        tvdPosition = tvx.readLong();
+        tvfPosition = tvx.readLong();
+      } else {
+        tvdPosition = tvd.length();
+        tvfPosition = tvf.length();
+      }
+      tvdLengths[count] = (int) (tvdPosition-lastTvdPosition);
+      tvfLengths[count] = (int) (tvfPosition-lastTvfPosition);
+      count++;
+      lastTvdPosition = tvdPosition;
+      lastTvfPosition = tvfPosition;
+    }
+  }
+
  private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
  {
    int format = in.readInt();
-    if (format > FORMAT_VERSION)
-    {
+    if (format > FORMAT_VERSION2) {
      throw new CorruptIndexException("Incompatible format version: " + format + " expected " 
-                                      + FORMAT_VERSION + " or less");
+                                      + FORMAT_VERSION2 + " or less");
    }
    return format;
  }

  void close() throws IOException {
-  	// make all effort to close up. Keep the first exception
-  	// and throw it as a new one.
-  	IOException keep = null;
-  	if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }
-  	if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
-  	if (tvf  != null) try {  tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
-  	if (keep != null) throw (IOException) keep.fillInStackTrace();
+    // make all effort to close up. Keep the first exception
+    // and throw it as a new one.
+    IOException keep = null;
+    if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }
+    if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
+    if (tvf  != null) try {  tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
+    if (keep != null) throw (IOException) keep.fillInStackTrace();
  }

  /**
@ -133,11 +219,11 @@ class TermVectorsReader implements Cloneable {
      //We don't need to do this in other seeks because we already have the
      // file pointer
      //that was written in another file
-      tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
+      seekTvx(docNum);
      //System.out.println("TVX Pointer: " + tvx.getFilePointer());
-      long position = tvx.readLong();
+      long tvdPosition = tvx.readLong();

-      tvd.seek(position);
+      tvd.seek(tvdPosition);
      int fieldCount = tvd.readVInt();
      //System.out.println("Num Fields: " + fieldCount);
      // There are only a few fields per document. We opt for a full scan
@ -146,7 +232,7 @@ class TermVectorsReader implements Cloneable {
      int number = 0;
      int found = -1;
      for (int i = 0; i < fieldCount; i++) {
-        if(tvdFormat == FORMAT_VERSION)
+        if (format >= FORMAT_VERSION)
          number = tvd.readVInt();
        else
          number += tvd.readVInt();
@ -159,8 +245,12 @@ class TermVectorsReader implements Cloneable {
      // document
      if (found != -1) {
        // Compute position in the tvf file
-        position = 0;
-        for (int i = 0; i <= found; i++)
+        long position;
+        if (format >= FORMAT_VERSION2)
+          position = tvx.readLong();
+        else
+          position = tvd.readVLong();
+        for (int i = 1; i <= found; i++)
          position += tvd.readVLong();

        mapper.setDocumentNumber(docNum);
@ -190,6 +280,45 @@ class TermVectorsReader implements Cloneable {
    return mapper.materializeVector();
  }

+  // Reads the String[] fields; you have to pre-seek tvd to
+  // the right point
+  final private String[] readFields(int fieldCount) throws IOException {
+    int number = 0;
+    String[] fields = new String[fieldCount];
+
+    for (int i = 0; i < fieldCount; i++) {
+      if (format >= FORMAT_VERSION)
+        number = tvd.readVInt();
+      else
+        number += tvd.readVInt();
+
+      fields[i] = fieldInfos.fieldName(number);
+    }
+
+    return fields;
+  }
+
+  // Reads the long[] offsets into TVF; you have to pre-seek
+  // tvx/tvd to the right point
+  final private long[] readTvfPointers(int fieldCount) throws IOException {
+    // Compute position in the tvf file
+    long position;
+    if (format >= FORMAT_VERSION2)
+      position = tvx.readLong();
+    else
+      position = tvd.readVLong();
+
+    long[] tvfPointers = new long[fieldCount];
+    tvfPointers[0] = position;
+
+    for (int i = 1; i < fieldCount; i++) {
+      position += tvd.readVLong();
+      tvfPointers[i] = position;
+    }
+
+    return tvfPointers;
+  }
+
  /**
   * Return all term vectors stored for this document or null if the could not be read in.
   * 
@ -201,34 +330,16 @@ class TermVectorsReader implements Cloneable {
    TermFreqVector[] result = null;
    if (tvx != null) {
      //We need to offset by
-      tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
-      long position = tvx.readLong();
+      seekTvx(docNum);
+      long tvdPosition = tvx.readLong();

-      tvd.seek(position);
+      tvd.seek(tvdPosition);
      int fieldCount = tvd.readVInt();

      // No fields are vectorized for this document
      if (fieldCount != 0) {
-        int number = 0;
-        String[] fields = new String[fieldCount];
-
-        for (int i = 0; i < fieldCount; i++) {
-          if(tvdFormat == FORMAT_VERSION)
-            number = tvd.readVInt();
-          else
-            number += tvd.readVInt();
-
-          fields[i] = fieldInfos.fieldName(number);
-        }
-
-        // Compute position in the tvf file
-        position = 0;
-        long[] tvfPointers = new long[fieldCount];
-        for (int i = 0; i < fieldCount; i++) {
-          position += tvd.readVLong();
-          tvfPointers[i] = position;
-        }
-
+        final String[] fields = readFields(fieldCount);
+        final long[] tvfPointers = readTvfPointers(fieldCount);
        result = readTermVectors(docNum, fields, tvfPointers);
      }
    } else {
@ -241,34 +352,17 @@ class TermVectorsReader implements Cloneable {
    // Check if no term vectors are available for this segment at all
    if (tvx != null) {
      //We need to offset by
-      tvx.seek((docNumber * 8L) + FORMAT_SIZE);
-      long position = tvx.readLong();

-      tvd.seek(position);
+      seekTvx(docNumber);
+      long tvdPosition = tvx.readLong();
+
+      tvd.seek(tvdPosition);
      int fieldCount = tvd.readVInt();

      // No fields are vectorized for this document
      if (fieldCount != 0) {
-        int number = 0;
-        String[] fields = new String[fieldCount];
-
-        for (int i = 0; i < fieldCount; i++) {
-          if(tvdFormat == FORMAT_VERSION)
-            number = tvd.readVInt();
-          else
-            number += tvd.readVInt();
-
-          fields[i] = fieldInfos.fieldName(number);
-        }
-
-        // Compute position in the tvf file
-        position = 0;
-        long[] tvfPointers = new long[fieldCount];
-        for (int i = 0; i < fieldCount; i++) {
-          position += tvd.readVLong();
-          tvfPointers[i] = position;
-        }
-
+        final String[] fields = readFields(fieldCount);
+        final long[] tvfPointers = readTvfPointers(fieldCount);
        mapper.setDocumentNumber(docNumber);
        readTermVectors(fields, tvfPointers, mapper);
      }
@ -293,9 +387,8 @@ class TermVectorsReader implements Cloneable {
  private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
          throws IOException {
    for (int i = 0; i < fields.length; i++) {
-       readTermVector(fields[i], tvfPointers[i], mapper);
+      readTermVector(fields[i], tvfPointers[i], mapper);
    }
-
  }


@ -324,7 +417,7 @@ class TermVectorsReader implements Cloneable {
    boolean storePositions;
    boolean storeOffsets;
    
-    if(tvfFormat == FORMAT_VERSION){
+    if (format >= FORMAT_VERSION){
      byte bits = tvf.readByte();
      storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
      storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
@ -400,8 +493,6 @@ class TermVectorsReader implements Cloneable {
    }
  }

-
-
  protected Object clone() {
    
    if (tvx == null || tvd == null || tvf == null)
@ -418,11 +509,9 @@ class TermVectorsReader implements Cloneable {
    
    return clone;
  }
-
-
-
 }

+
 /**
 * Models the existing parallel array structure
 */
--- a/src/java/org/apache/lucene/index/TermVectorsWriter.java
+++ b/src/java/org/apache/lucene/index/TermVectorsWriter.java
@ -33,11 +33,11 @@ final class TermVectorsWriter {
    throws IOException {
    // Open files for TermVector storage
    tvx = directory.createOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
-    tvx.writeInt(TermVectorsReader.FORMAT_VERSION);
+    tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
    tvd = directory.createOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
-    tvd.writeInt(TermVectorsReader.FORMAT_VERSION);
+    tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
    tvf = directory.createOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
-    tvf.writeInt(TermVectorsReader.FORMAT_VERSION);
+    tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);

    this.fieldInfos = fieldInfos;
  }
@ -53,6 +53,7 @@ final class TermVectorsWriter {
      throws IOException {

    tvx.writeLong(tvd.getFilePointer());
+    tvx.writeLong(tvf.getFilePointer());

    if (vectors != null) {
      final int numFields = vectors.length;
@ -145,8 +146,8 @@ final class TermVectorsWriter {
      }

      // 2nd pass: write field pointers to tvd
-      long lastFieldPointer = 0;
-      for (int i=0; i<numFields; i++) {
+      long lastFieldPointer = fieldPointers[0];
+      for (int i=1; i<numFields; i++) {
        final long fieldPointer = fieldPointers[i];
        tvd.writeVLong(fieldPointer-lastFieldPointer);
        lastFieldPointer = fieldPointer;
@ -154,6 +155,28 @@ final class TermVectorsWriter {
    } else
      tvd.writeVInt(0);
  }
+
+  /**
+   * Do a bulk copy of numDocs documents from reader to our
+   * streams.  This is used to expedite merging, if the
+   * field numbers are congruent.
+   */
+  final void addRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs) throws IOException {
+    long tvdPosition = tvd.getFilePointer();
+    long tvfPosition = tvf.getFilePointer();
+    long tvdStart = tvdPosition;
+    long tvfStart = tvfPosition;
+    for(int i=0;i<numDocs;i++) {
+      tvx.writeLong(tvdPosition);
+      tvdPosition += tvdLengths[i];
+      tvx.writeLong(tvfPosition);
+      tvfPosition += tvfLengths[i];
+    }
+    tvd.copyBytes(reader.getTvdStream(), tvdPosition-tvdStart);
+    tvf.copyBytes(reader.getTvfStream(), tvfPosition-tvfStart);
+    assert tvd.getFilePointer() == tvdPosition;
+    assert tvf.getFilePointer() == tvfPosition;
+  }
  
  /** Close all streams. */
  final void close() throws IOException {
--- a/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
+++ b/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
@ -18,10 +18,16 @@ package org.apache.lucene.index;
 */

 import org.apache.lucene.util.LuceneTestCase;
-import java.util.Vector;
+
 import java.util.Arrays;
-import java.io.ByteArrayOutputStream;
-import java.io.ObjectOutputStream;
+import java.util.Enumeration;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipEntry;
+
+import java.io.OutputStream;
+import java.io.InputStream;
+import java.io.FileOutputStream;
+import java.io.BufferedOutputStream;
 import java.io.IOException;
 import java.io.File;

@ -33,9 +39,6 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import java.io.*;
-import java.util.*;
-import java.util.zip.*;

 /*
  Verify we can read the pre-2.1 file format, do searches
@ -50,11 +53,11 @@ public class TestBackwardsCompatibility extends LuceneTestCase

  /*
  public void testCreatePreLocklessCFS() throws IOException {
-    createIndex("src/test/org/apache/lucene/index/index.prelockless.cfs", true);
+    createIndex("src/test/org/apache/lucene/index/index.cfs", true);
  }

  public void testCreatePreLocklessNoCFS() throws IOException {
-    createIndex("src/test/org/apache/lucene/index/index.prelockless.nocfs", false);
+    createIndex("src/test/org/apache/lucene/index/index.nocfs", false);
  }
  */

@ -106,10 +109,14 @@ public class TestBackwardsCompatibility extends LuceneTestCase
    rmDir(dirName);
  }

-  final String[] oldNames = {"prelockless.cfs",
-                             "prelockless.nocfs",
-                             "presharedstores.cfs",
-                             "presharedstores.nocfs"};
+  final String[] oldNames = {"19.cfs",
+                             "19.nocfs",
+                             "20.cfs",
+                             "20.nocfs",
+                             "21.cfs",
+                             "21.nocfs",
+                             "22.cfs",
+                             "22.nocfs"};

  public void testSearchOldIndex() throws IOException {
    for(int i=0;i<oldNames.length;i++) {
@ -146,6 +153,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase
    }
  }

+  private void testHits(Hits hits, int expectedCount, IndexReader reader) throws IOException {
+    final int hitCount = hits.length();
+    assertEquals("wrong number of hits", expectedCount, hitCount);
+    for(int i=0;i<hitCount;i++) {
+      hits.doc(i);
+      reader.getTermFreqVectors(hits.id(i));
+    }
+  }
+
  public void searchIndex(String dirName) throws IOException {
    //QueryParser parser = new QueryParser("contents", new WhitespaceAnalyzer());
    //Query query = parser.parse("handle:1");
@ -156,12 +172,14 @@ public class TestBackwardsCompatibility extends LuceneTestCase
    IndexSearcher searcher = new IndexSearcher(dir);
    
    Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
-    assertEquals(34, hits.length());
-    Document d = hits.doc(0);

-    // First document should be #21 since it's norm was increased:
+    // First document should be #21 since it's norm was
+    // increased:
+    Document d = hits.doc(0);
    assertEquals("didn't get the right document first", "21", d.get("id"));

+    testHits(hits, 34, searcher.getIndexReader());
+
    searcher.close();
    dir.close();
  }
@ -189,9 +207,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase
    // make sure searching sees right # hits
    IndexSearcher searcher = new IndexSearcher(dir);
    Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
-    assertEquals("wrong number of hits", 44, hits.length());
    Document d = hits.doc(0);
    assertEquals("wrong first document", "21", d.get("id"));
+    testHits(hits, 44, searcher.getIndexReader());
    searcher.close();

    // make sure we can do delete & setNorm against this
@ -209,6 +227,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
    assertEquals("wrong number of hits", 43, hits.length());
    d = hits.doc(0);
    assertEquals("wrong first document", "22", d.get("id"));
+    testHits(hits, 43, searcher.getIndexReader());
    searcher.close();

    // optimize
@ -220,6 +239,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
    hits = searcher.search(new TermQuery(new Term("content", "aaa")));
    assertEquals("wrong number of hits", 43, hits.length());
    d = hits.doc(0);
+    testHits(hits, 43, searcher.getIndexReader());
    assertEquals("wrong first document", "22", d.get("id"));
    searcher.close();

@ -257,6 +277,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
    assertEquals("wrong number of hits", 33, hits.length());
    d = hits.doc(0);
    assertEquals("wrong first document", "22", d.get("id"));
+    testHits(hits, 33, searcher.getIndexReader());
    searcher.close();

    // optimize
@ -269,6 +290,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
    assertEquals("wrong number of hits", 33, hits.length());
    d = hits.doc(0);
    assertEquals("wrong first document", "22", d.get("id"));
+    testHits(hits, 33, searcher.getIndexReader());
    searcher.close();

    dir.close();
@ -283,6 +305,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
    Directory dir = FSDirectory.getDirectory(dirName);
    IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
    writer.setUseCompoundFile(doCFS);
+    writer.setMaxBufferedDocs(10);
    
    for(int i=0;i<35;i++) {
      addDoc(writer, i);
@ -393,6 +416,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
    Document doc = new Document();
    doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED));
    doc.add(new Field("id", Integer.toString(id), Field.Store.YES, Field.Index.UN_TOKENIZED));
+    doc.add(new Field("content2", "here is more content with aaa aaa aaa", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.addDocument(doc);
  }

--- a/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/src/test/org/apache/lucene/index/TestIndexWriter.java
@ -1937,9 +1937,10 @@ public class TestIndexWriter extends LuceneTestCase
      for(int j=0;j<reader.maxDoc();j++) {
        if (reader.isDeleted(j))
          numDel++;
-        else
+        else {
          reader.document(j);
-        reader.getTermFreqVectors(j);
+          reader.getTermFreqVectors(j);
+        }
      }
      reader.close();

@ -1963,9 +1964,10 @@ public class TestIndexWriter extends LuceneTestCase
      for(int j=0;j<reader.maxDoc();j++) {
        if (reader.isDeleted(j))
          numDel++;
-        else
+        else {
          reader.document(j);
-        reader.getTermFreqVectors(j);
+          reader.getTermFreqVectors(j);
+        }
      }
      reader.close();
      assertEquals(0, numDel);
@ -2053,9 +2055,10 @@ public class TestIndexWriter extends LuceneTestCase
      for(int j=0;j<reader.maxDoc();j++) {
        if (reader.isDeleted(j))
          numDel++;
-        else
+        else {
          reader.document(j);
-        reader.getTermFreqVectors(j);
+          reader.getTermFreqVectors(j);
+        }
      }
      reader.close();

@ -2079,9 +2082,10 @@ public class TestIndexWriter extends LuceneTestCase
      for(int j=0;j<reader.maxDoc();j++) {
        if (reader.isDeleted(j))
          numDel++;
-        else
+        else {
          reader.document(j);
-        reader.getTermFreqVectors(j);
+          reader.getTermFreqVectors(j);
+        }
      }
      reader.close();
      assertEquals(0, numDel);
--- a/src/test/org/apache/lucene/index/index.19.cfs.zip
+++ b/src/test/org/apache/lucene/index/index.19.cfs.zip
--- a/src/test/org/apache/lucene/index/index.19.nocfs.zip
+++ b/src/test/org/apache/lucene/index/index.19.nocfs.zip
--- a/src/test/org/apache/lucene/index/index.20.cfs.zip
+++ b/src/test/org/apache/lucene/index/index.20.cfs.zip
--- a/src/test/org/apache/lucene/index/index.20.nocfs.zip
+++ b/src/test/org/apache/lucene/index/index.20.nocfs.zip
--- a/src/test/org/apache/lucene/index/index.21.cfs.zip
+++ b/src/test/org/apache/lucene/index/index.21.cfs.zip
--- a/src/test/org/apache/lucene/index/index.21.nocfs.zip
+++ b/src/test/org/apache/lucene/index/index.21.nocfs.zip
--- a/src/test/org/apache/lucene/index/index.22.cfs.zip
+++ b/src/test/org/apache/lucene/index/index.22.cfs.zip
--- a/src/test/org/apache/lucene/index/index.22.nocfs.zip
+++ b/src/test/org/apache/lucene/index/index.22.nocfs.zip
--- a/src/test/org/apache/lucene/index/index.prelockless.cfs.zip
+++ b/src/test/org/apache/lucene/index/index.prelockless.cfs.zip
--- a/src/test/org/apache/lucene/index/index.prelockless.nocfs.zip
+++ b/src/test/org/apache/lucene/index/index.prelockless.nocfs.zip
--- a/src/test/org/apache/lucene/index/index.presharedstores.cfs.zip
+++ b/src/test/org/apache/lucene/index/index.presharedstores.cfs.zip
--- a/src/test/org/apache/lucene/index/index.presharedstores.nocfs.zip
+++ b/src/test/org/apache/lucene/index/index.presharedstores.nocfs.zip