LUCENE-3613: split out 4.0/3.x term vectors implementations

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1232652 13f79535-47bb-0310-9956-ffa450edef68
Author: Robert Muir, 2012-01-17 23:47:14 +00:00
parent c4f57c6081
commit d159f25b63
7 changed files with 993 additions and 38 deletions

View File: Lucene3xCodec.java

@ -51,8 +51,7 @@ public class Lucene3xCodec extends Codec {
 // TODO: this should really be a different impl
 private final StoredFieldsFormat fieldsFormat = new Lucene40StoredFieldsFormat();
-// TODO: this should really be a different impl
-private final TermVectorsFormat vectorsFormat = new Lucene40TermVectorsFormat();
+private final TermVectorsFormat vectorsFormat = new Lucene3xTermVectorsFormat();
 private final FieldInfosFormat fieldInfosFormat = new Lucene3xFieldInfosFormat();

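For orientation: a codec hands its per-segment components to the rest of the engine through accessor methods. The override below is a sketch, not part of this hunk; the accessor name termVectorsFormat() is taken from the PreFlexRWCodec diff further down, and the body is an assumption about how Lucene3xCodec exposes the field changed above.

@Override
public TermVectorsFormat termVectorsFormat() {
  return vectorsFormat; // since this commit, a read-only Lucene3xTermVectorsFormat
}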
View File: Lucene3xTermVectorsFormat.java

@ -0,0 +1,49 @@
package org.apache.lucene.codecs.lucene3x;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
public class Lucene3xTermVectorsFormat extends TermVectorsFormat {
@Override
public TermVectorsReader vectorsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context) throws IOException {
return new Lucene3xTermVectorsReader(directory, segmentInfo, fieldInfos, context);
}
@Override
public TermVectorsWriter vectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
// TODO all these IAEs in preflex should be UOEs?
throw new IllegalArgumentException("this codec can only be used for reading");
}
@Override
public void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
Lucene3xTermVectorsReader.files(dir, info, files);
}
}

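Usage note: the 3.x format is deliberately read-only, so only the reader side is functional. A minimal sketch of the expected behavior (setup of dir, segmentInfo, fieldInfos, and context is elided; the segment name "_0" is illustrative):

TermVectorsFormat format = new Lucene3xTermVectorsFormat();
TermVectorsReader reader = format.vectorsReader(dir, segmentInfo, fieldInfos, context);
try {
  format.vectorsWriter(dir, "_0", context); // unconditionally throws
} catch (IllegalArgumentException expected) {
  // "this codec can only be used for reading"
}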
View File: Lucene3xTermVectorsReader.java

@ -0,0 +1,672 @@
package org.apache.lucene.codecs.lucene3x;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
public class Lucene3xTermVectorsReader extends TermVectorsReader {
// NOTE: if you make a new format, it must be larger than
// the current format
// Changed strings to UTF8 with length-in-bytes not length-in-chars
static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
// NOTE: always change this if you switch to a new format!
// whenever you add a new format, make it 1 larger (positive version logic)!
public static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
// when removing support for old versions, leave the last supported version here
public static final int FORMAT_MINIMUM = FORMAT_UTF8_LENGTH_IN_BYTES;
// The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
static final int FORMAT_SIZE = 4;
public static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
public static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
/** Extension of vectors fields file */
public static final String VECTORS_FIELDS_EXTENSION = "tvf";
/** Extension of vectors documents file */
public static final String VECTORS_DOCUMENTS_EXTENSION = "tvd";
/** Extension of vectors index file */
public static final String VECTORS_INDEX_EXTENSION = "tvx";
private FieldInfos fieldInfos;
private IndexInput tvx;
private IndexInput tvd;
private IndexInput tvf;
private int size;
private int numTotalDocs;
// The docID offset where our docs begin in the index
// file. This will be 0 if we have our own private file.
private int docStoreOffset;
private final int format;
// used by clone
Lucene3xTermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, int docStoreOffset, int format) {
this.fieldInfos = fieldInfos;
this.tvx = tvx;
this.tvd = tvd;
this.tvf = tvf;
this.size = size;
this.numTotalDocs = numTotalDocs;
this.docStoreOffset = docStoreOffset;
this.format = format;
}
public Lucene3xTermVectorsReader(Directory d, SegmentInfo si, FieldInfos fieldInfos, IOContext context)
throws CorruptIndexException, IOException {
final String segment = si.getDocStoreSegment();
final int docStoreOffset = si.getDocStoreOffset();
final int size = si.docCount;
boolean success = false;
try {
String idxName = IndexFileNames.segmentFileName(segment, "", VECTORS_INDEX_EXTENSION);
tvx = d.openInput(idxName, context);
format = checkValidFormat(tvx);
String fn = IndexFileNames.segmentFileName(segment, "", VECTORS_DOCUMENTS_EXTENSION);
tvd = d.openInput(fn, context);
final int tvdFormat = checkValidFormat(tvd);
fn = IndexFileNames.segmentFileName(segment, "", VECTORS_FIELDS_EXTENSION);
tvf = d.openInput(fn, context);
final int tvfFormat = checkValidFormat(tvf);
assert format == tvdFormat;
assert format == tvfFormat;
numTotalDocs = (int) (tvx.length() >> 4);
if (-1 == docStoreOffset) {
this.docStoreOffset = 0;
this.size = numTotalDocs;
assert size == 0 || numTotalDocs == size;
} else {
this.docStoreOffset = docStoreOffset;
this.size = size;
// Verify the file is long enough to hold all of our
// docs
assert numTotalDocs >= size + docStoreOffset: "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset;
}
this.fieldInfos = fieldInfos;
success = true;
} finally {
// With lock-less commits, it's entirely possible (and
// fine) to hit a FileNotFound exception above. In
// this case, we want to explicitly close any subset
// of things that were opened so that we don't have to
// wait for a GC to do so.
if (!success) {
close();
}
}
}
// Not private to avoid synthetic access$NNN methods
void seekTvx(final int docNum) throws IOException {
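// tvx layout: a 4-byte format header (FORMAT_SIZE) followed by 16 bytes per
// document: one long pointing into tvd and one long pointing into tvf (see
// TVFields); docStoreOffset shifts the index into a shared doc-store file,
// and is 0 when this segment has its own private vectors files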
tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
}
private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
{
int format = in.readInt();
if (format < FORMAT_MINIMUM)
throw new IndexFormatTooOldException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT);
if (format > FORMAT_CURRENT)
throw new IndexFormatTooNewException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT);
return format;
}
public void close() throws IOException {
IOUtils.close(tvx, tvd, tvf);
}
/** @return the number of documents in the reader */
int size() {
return size;
}
private class TVFields extends Fields {
private final int[] fieldNumbers;
private final long[] fieldFPs;
private final Map<Integer,Integer> fieldNumberToIndex = new HashMap<Integer,Integer>();
public TVFields(int docID) throws IOException {
seekTvx(docID);
tvd.seek(tvx.readLong());
final int fieldCount = tvd.readVInt();
assert fieldCount >= 0;
if (fieldCount != 0) {
fieldNumbers = new int[fieldCount];
fieldFPs = new long[fieldCount];
for(int fieldUpto=0;fieldUpto<fieldCount;fieldUpto++) {
final int fieldNumber = tvd.readVInt();
fieldNumbers[fieldUpto] = fieldNumber;
fieldNumberToIndex.put(fieldNumber, fieldUpto);
}
long position = tvx.readLong();
fieldFPs[0] = position;
for(int fieldUpto=1;fieldUpto<fieldCount;fieldUpto++) {
position += tvd.readVLong();
fieldFPs[fieldUpto] = position;
}
} else {
// TODO: we can improve writer here, eg write 0 into
// tvx file, so we know on first read from tvx that
// this doc has no TVs
fieldNumbers = null;
fieldFPs = null;
}
}
@Override
public FieldsEnum iterator() throws IOException {
return new FieldsEnum() {
private int fieldUpto;
@Override
public String next() throws IOException {
if (fieldNumbers != null && fieldUpto < fieldNumbers.length) {
return fieldInfos.fieldName(fieldNumbers[fieldUpto++]);
} else {
return null;
}
}
@Override
public Terms terms() throws IOException {
return TVFields.this.terms(fieldInfos.fieldName(fieldNumbers[fieldUpto-1]));
}
};
}
@Override
public Terms terms(String field) throws IOException {
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
// No such field
return null;
}
final Integer fieldIndex = fieldNumberToIndex.get(fieldInfo.number);
if (fieldIndex == null) {
// Term vectors were not indexed for this field
return null;
}
return new TVTerms(fieldFPs[fieldIndex]);
}
@Override
public int getUniqueFieldCount() {
if (fieldNumbers == null) {
return 0;
} else {
return fieldNumbers.length;
}
}
}
private class TVTerms extends Terms {
private final int numTerms;
private final long tvfFPStart;
public TVTerms(long tvfFP) throws IOException {
tvf.seek(tvfFP);
numTerms = tvf.readVInt();
tvfFPStart = tvf.getFilePointer();
}
@Override
public TermsEnum iterator(TermsEnum reuse) throws IOException {
TVTermsEnum termsEnum;
if (reuse instanceof TVTermsEnum) {
termsEnum = (TVTermsEnum) reuse;
if (!termsEnum.canReuse(tvf)) {
termsEnum = new TVTermsEnum();
}
} else {
termsEnum = new TVTermsEnum();
}
termsEnum.reset(numTerms, tvfFPStart);
return termsEnum;
}
@Override
public long getUniqueTermCount() {
return numTerms;
}
@Override
public long getSumTotalTermFreq() {
return -1;
}
@Override
public long getSumDocFreq() {
// Every term occurs in just one doc:
return numTerms;
}
@Override
public int getDocCount() {
return 1;
}
@Override
public Comparator<BytesRef> getComparator() {
// TODO: really indexer hardwires
// this...? I guess codec could buffer and re-sort...
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}
private class TVTermsEnum extends TermsEnum {
private final IndexInput origTVF;
private final IndexInput tvf;
private int numTerms;
private int nextTerm;
private int freq;
private BytesRef lastTerm = new BytesRef();
private BytesRef term = new BytesRef();
private boolean storePositions;
private boolean storeOffsets;
private long tvfFP;
private int[] positions;
private int[] startOffsets;
private int[] endOffsets;
// NOTE: tvf is pre-positioned by caller
public TVTermsEnum() throws IOException {
this.origTVF = Lucene3xTermVectorsReader.this.tvf;
tvf = (IndexInput) origTVF.clone();
}
public boolean canReuse(IndexInput tvf) {
return tvf == origTVF;
}
public void reset(int numTerms, long tvfFPStart) throws IOException {
this.numTerms = numTerms;
nextTerm = 0;
tvf.seek(tvfFPStart);
final byte bits = tvf.readByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
tvfFP = 1+tvfFPStart;
positions = null;
startOffsets = null;
endOffsets = null;
}
// NOTE: slow! (linear scan)
@Override
public SeekStatus seekCeil(BytesRef text, boolean useCache)
throws IOException {
if (nextTerm != 0 && text.compareTo(term) < 0) {
nextTerm = 0;
tvf.seek(tvfFP);
}
while (next() != null) {
final int cmp = text.compareTo(term);
if (cmp < 0) {
return SeekStatus.NOT_FOUND;
} else if (cmp == 0) {
return SeekStatus.FOUND;
}
}
return SeekStatus.END;
}
@Override
public void seekExact(long ord) {
throw new UnsupportedOperationException();
}
@Override
public BytesRef next() throws IOException {
if (nextTerm >= numTerms) {
return null;
}
term.copyBytes(lastTerm);
final int start = tvf.readVInt();
final int deltaLen = tvf.readVInt();
term.length = start + deltaLen;
term.grow(term.length);
tvf.readBytes(term.bytes, start, deltaLen);
freq = tvf.readVInt();
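// the per-term payload is delta-encoded vInts: each position is a delta
// from the previous position; each startOffset is a delta from the
// previous endOffset, and each endOffset a delta from its own startOffset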
if (storePositions) {
// TODO: we could maybe reuse last array, if we can
// somehow be careful about consumer never using two
// D&PEnums at once...
positions = new int[freq];
int pos = 0;
for(int posUpto=0;posUpto<freq;posUpto++) {
pos += tvf.readVInt();
positions[posUpto] = pos;
}
}
if (storeOffsets) {
startOffsets = new int[freq];
endOffsets = new int[freq];
int offset = 0;
for(int posUpto=0;posUpto<freq;posUpto++) {
startOffsets[posUpto] = offset + tvf.readVInt();
offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.readVInt();
}
}
lastTerm.copyBytes(term);
nextTerm++;
return term;
}
@Override
public BytesRef term() {
return term;
}
@Override
public long ord() {
throw new UnsupportedOperationException();
}
@Override
public int docFreq() {
return 1;
}
@Override
public long totalTermFreq() {
return freq;
}
@Override
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, boolean needsFreqs /* ignored */) throws IOException {
TVDocsEnum docsEnum;
if (reuse != null && reuse instanceof TVDocsEnum) {
docsEnum = (TVDocsEnum) reuse;
} else {
docsEnum = new TVDocsEnum();
}
docsEnum.reset(liveDocs, freq);
return docsEnum;
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
if (needsOffsets && !storeOffsets) {
return null;
}
if (!storePositions && !storeOffsets) {
return null;
}
TVDocsAndPositionsEnum docsAndPositionsEnum;
if (reuse != null && reuse instanceof TVDocsAndPositionsEnum) {
docsAndPositionsEnum = (TVDocsAndPositionsEnum) reuse;
} else {
docsAndPositionsEnum = new TVDocsAndPositionsEnum();
}
docsAndPositionsEnum.reset(liveDocs, positions, startOffsets, endOffsets);
return docsAndPositionsEnum;
}
@Override
public Comparator<BytesRef> getComparator() {
// TODO: really indexer hardwires
// this...? I guess codec could buffer and re-sort...
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}
// NOTE: sort of a silly class, since you can get the
// freq() already by TermsEnum.totalTermFreq
private static class TVDocsEnum extends DocsEnum {
private boolean didNext;
private int doc = -1;
private int freq;
private Bits liveDocs;
@Override
public int freq() {
return freq;
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() {
if (!didNext && (liveDocs == null || liveDocs.get(0))) {
didNext = true;
return (doc = 0);
} else {
return (doc = NO_MORE_DOCS);
}
}
@Override
public int advance(int target) {
if (!didNext && target == 0) {
return nextDoc();
} else {
return (doc = NO_MORE_DOCS);
}
}
public void reset(Bits liveDocs, int freq) {
this.liveDocs = liveDocs;
this.freq = freq;
this.doc = -1;
didNext = false;
}
}
private static class TVDocsAndPositionsEnum extends DocsAndPositionsEnum {
private boolean didNext;
private int doc = -1;
private int nextPos;
private Bits liveDocs;
private int[] positions;
private int[] startOffsets;
private int[] endOffsets;
@Override
public int freq() {
if (positions != null) {
return positions.length;
} else {
assert startOffsets != null;
return startOffsets.length;
}
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() {
if (!didNext && (liveDocs == null || liveDocs.get(0))) {
didNext = true;
return (doc = 0);
} else {
return (doc = NO_MORE_DOCS);
}
}
@Override
public int advance(int target) {
if (!didNext && target == 0) {
return nextDoc();
} else {
return (doc = NO_MORE_DOCS);
}
}
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets) {
this.liveDocs = liveDocs;
this.positions = positions;
this.startOffsets = startOffsets;
this.endOffsets = endOffsets;
this.doc = -1;
didNext = false;
nextPos = 0;
}
@Override
public BytesRef getPayload() {
return null;
}
@Override
public boolean hasPayload() {
return false;
}
@Override
public int nextPosition() {
assert (positions != null && nextPos < positions.length) ||
startOffsets != null && nextPos < startOffsets.length;
if (positions != null) {
return positions[nextPos++];
} else {
nextPos++;
return -1;
}
}
@Override
public int startOffset() {
assert startOffsets != null;
return startOffsets[nextPos-1];
}
@Override
public int endOffset() {
assert endOffsets != null;
return endOffsets[nextPos-1];
}
}
@Override
public Fields get(int docID) throws IOException {
if (docID < 0 || docID >= numTotalDocs) {
throw new IllegalArgumentException("doID=" + docID + " is out of bounds [0.." + (numTotalDocs-1) + "]");
}
if (tvx != null) {
Fields fields = new TVFields(docID);
if (fields.getUniqueFieldCount() == 0) {
// TODO: we can improve writer here, eg write 0 into
// tvx file, so we know on first read from tvx that
// this doc has no TVs
return null;
} else {
return fields;
}
} else {
return null;
}
}
@Override
public TermVectorsReader clone() {
IndexInput cloneTvx = null;
IndexInput cloneTvd = null;
IndexInput cloneTvf = null;
// These are null when a TermVectorsReader was created
// on a segment that did not have term vectors saved
if (tvx != null && tvd != null && tvf != null) {
cloneTvx = (IndexInput) tvx.clone();
cloneTvd = (IndexInput) tvd.clone();
cloneTvf = (IndexInput) tvf.clone();
}
return new Lucene3xTermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, docStoreOffset, format);
}
public static void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
if (info.getHasVectors()) {
if (info.getDocStoreOffset() != -1) {
assert info.getDocStoreSegment() != null;
if (!info.getDocStoreIsCompoundFile()) {
files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_INDEX_EXTENSION));
files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_FIELDS_EXTENSION));
files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_DOCUMENTS_EXTENSION));
}
} else {
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_INDEX_EXTENSION));
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_FIELDS_EXTENSION));
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_DOCUMENTS_EXTENSION));
}
}
}
}

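To make the vInt delta scheme in TVTermsEnum.next() concrete, here is a self-contained sketch that decodes the same way using plain int arrays in place of the tvf stream; the class name and all values are illustrative:

import java.util.Arrays;

// Decode freq=2 positions and offsets from delta-encoded ints, mirroring
// the two loops in next() above.
public class TvfDeltaDecodeDemo {
  public static void main(String[] args) {
    int[] posDeltas = {3, 4};        // position deltas, one vInt per occurrence
    int[] offDeltas = {0, 5, 2, 4};  // (startDelta, endDelta) pairs
    int[] positions = new int[2], starts = new int[2], ends = new int[2];
    int pos = 0, offset = 0;
    for (int i = 0; i < 2; i++) {
      pos += posDeltas[i];
      positions[i] = pos;                                  // -> 3, 7
    }
    for (int i = 0; i < 2; i++) {
      starts[i] = offset + offDeltas[2 * i];               // -> 0, 7
      offset = ends[i] = starts[i] + offDeltas[2 * i + 1]; // -> 5, 11
    }
    System.out.println(Arrays.toString(positions) + " "
        + Arrays.toString(starts) + " " + Arrays.toString(ends));
  }
}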
View File: Lucene40TermVectorsReader.java

@ -74,8 +74,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
 static final String VECTORS_DOCUMENTS_EXTENSION = "tvd";
 /** Extension of vectors index file */
-// TODO: shouldnt be visible to segments reader, preflex should do this itself somehow
-public static final String VECTORS_INDEX_EXTENSION = "tvx";
+static final String VECTORS_INDEX_EXTENSION = "tvx";
 private FieldInfos fieldInfos;
@ -84,29 +83,23 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
 private IndexInput tvf;
 private int size;
 private int numTotalDocs;
-// The docID offset where our docs begin in the index
-// file. This will be 0 if we have our own private file.
-private int docStoreOffset;
 private final int format;
 // used by clone
-Lucene40TermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, int docStoreOffset, int format) {
+Lucene40TermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, int format) {
   this.fieldInfos = fieldInfos;
   this.tvx = tvx;
   this.tvd = tvd;
   this.tvf = tvf;
   this.size = size;
   this.numTotalDocs = numTotalDocs;
-  this.docStoreOffset = docStoreOffset;
   this.format = format;
 }
 public Lucene40TermVectorsReader(Directory d, SegmentInfo si, FieldInfos fieldInfos, IOContext context)
     throws CorruptIndexException, IOException {
-  final String segment = si.getDocStoreSegment();
-  final int docStoreOffset = si.getDocStoreOffset();
+  final String segment = si.name;
   final int size = si.docCount;
   boolean success = false;
@ -127,17 +120,8 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
   numTotalDocs = (int) (tvx.length() >> 4);
-  if (-1 == docStoreOffset) {
-    this.docStoreOffset = 0;
-    this.size = numTotalDocs;
-    assert size == 0 || numTotalDocs == size;
-  } else {
-    this.docStoreOffset = docStoreOffset;
-    this.size = size;
-    // Verify the file is long enough to hold all of our
-    // docs
-    assert numTotalDocs >= size + docStoreOffset: "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset;
-  }
+  this.size = numTotalDocs;
+  assert size == 0 || numTotalDocs == size;
   this.fieldInfos = fieldInfos;
   success = true;
@ -165,7 +149,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
 // Not private to avoid synthetic access$NNN methods
 void seekTvx(final int docNum) throws IOException {
-  tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
+  tvx.seek(docNum * 16L + FORMAT_SIZE);
 }
 boolean canReadRawDocs() {
@ -201,7 +185,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
 int count = 0;
 while (count < numDocs) {
-  final int docID = docStoreOffset + startDocID + count + 1;
+  final int docID = startDocID + count + 1;
   assert docID <= numTotalDocs;
   if (docID < numTotalDocs) {
     tvdPosition = tvx.readLong();
@ -712,23 +696,14 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
     cloneTvf = (IndexInput) tvf.clone();
   }
-  return new Lucene40TermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, docStoreOffset, format);
+  return new Lucene40TermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, format);
 }
 public static void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
   if (info.getHasVectors()) {
-    if (info.getDocStoreOffset() != -1) {
-      assert info.getDocStoreSegment() != null;
-      if (!info.getDocStoreIsCompoundFile()) {
-        files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_INDEX_EXTENSION));
-        files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_FIELDS_EXTENSION));
-        files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_DOCUMENTS_EXTENSION));
-      }
-    } else {
-      files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_INDEX_EXTENSION));
-      files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_FIELDS_EXTENSION));
-      files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_DOCUMENTS_EXTENSION));
-    }
+    files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_INDEX_EXTENSION));
+    files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_FIELDS_EXTENSION));
+    files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_DOCUMENTS_EXTENSION));
   }
 }
 }

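With shared doc stores gone from 4.0 segments, the offset term simply drops out of the tvx addressing; comparing the two seek expressions from the hunks above:

// 3.x: tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
// 4.0: tvx.seek(docNum * 16L + FORMAT_SIZE); // every segment owns its files, so the offset is gone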
View File: PreFlexRWCodec.java

@ -20,6 +20,7 @@ package org.apache.lucene.codecs.preflexrw;
 import org.apache.lucene.codecs.FieldInfosFormat;
 import org.apache.lucene.codecs.NormsFormat;
 import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.TermVectorsFormat;
 import org.apache.lucene.codecs.lucene3x.Lucene3xCodec;
 import org.apache.lucene.util.LuceneTestCase;
@ -31,6 +32,7 @@ public class PreFlexRWCodec extends Lucene3xCodec {
 private final PostingsFormat postings = new PreFlexRWPostingsFormat();
 private final NormsFormat norms = new PreFlexRWNormsFormat();
 private final FieldInfosFormat fieldInfos = new PreFlexRWFieldInfosFormat();
+private final TermVectorsFormat termVectors = new PreFlexRWTermVectorsFormat();
 @Override
 public PostingsFormat postingsFormat() {
@ -58,4 +60,13 @@ public class PreFlexRWCodec extends Lucene3xCodec {
     return super.fieldInfosFormat();
   }
 }
+@Override
+public TermVectorsFormat termVectorsFormat() {
+  if (LuceneTestCase.PREFLEX_IMPERSONATION_IS_ACTIVE) {
+    return termVectors;
+  } else {
+    return super.termVectorsFormat();
+  }
+}
 }

View File: PreFlexRWTermVectorsFormat.java

@ -0,0 +1,33 @@
package org.apache.lucene.codecs.preflexrw;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsFormat;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
public class PreFlexRWTermVectorsFormat extends Lucene3xTermVectorsFormat {
@Override
public TermVectorsWriter vectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
return new PreFlexRWTermVectorsWriter(directory, segment, context);
}
}

View File: PreFlexRWTermVectorsWriter.java

@ -0,0 +1,216 @@
package org.apache.lucene.codecs.preflexrw;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
// TODO: surrogates dance!
public final class PreFlexRWTermVectorsWriter extends TermVectorsWriter {
private final Directory directory;
private final String segment;
private IndexOutput tvx = null, tvd = null, tvf = null;
public PreFlexRWTermVectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
this.directory = directory;
this.segment = segment;
boolean success = false;
try {
// Open files for TermVector storage
tvx = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_INDEX_EXTENSION), context);
tvx.writeInt(Lucene3xTermVectorsReader.FORMAT_CURRENT);
tvd = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_DOCUMENTS_EXTENSION), context);
tvd.writeInt(Lucene3xTermVectorsReader.FORMAT_CURRENT);
tvf = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_FIELDS_EXTENSION), context);
tvf.writeInt(Lucene3xTermVectorsReader.FORMAT_CURRENT);
success = true;
} finally {
if (!success) {
abort();
}
}
}
@Override
public void startDocument(int numVectorFields) throws IOException {
lastFieldName = null;
this.numVectorFields = numVectorFields;
tvx.writeLong(tvd.getFilePointer());
tvx.writeLong(tvf.getFilePointer());
tvd.writeVInt(numVectorFields);
fieldCount = 0;
fps = ArrayUtil.grow(fps, numVectorFields);
}
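// note: each startDocument() appends exactly 16 bytes to tvx (the tvd and
// tvf file pointers), which is what lets Lucene3xTermVectorsReader.seekTvx()
// address documents at (docNum + docStoreOffset) * 16L + FORMAT_SIZE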
private long fps[] = new long[10]; // pointers to the tvf before writing each field
private int fieldCount = 0; // number of fields we have written so far for this document
private int numVectorFields = 0; // total number of fields we will write for this document
private String lastFieldName;
@Override
public void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets) throws IOException {
assert lastFieldName == null || info.name.compareTo(lastFieldName) > 0: "fieldName=" + info.name + " lastFieldName=" + lastFieldName;
lastFieldName = info.name;
this.positions = positions;
this.offsets = offsets;
lastTerm.length = 0;
fps[fieldCount++] = tvf.getFilePointer();
tvd.writeVInt(info.number);
tvf.writeVInt(numTerms);
byte bits = 0x0;
if (positions)
bits |= Lucene3xTermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
if (offsets)
bits |= Lucene3xTermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
tvf.writeByte(bits);
assert fieldCount <= numVectorFields;
if (fieldCount == numVectorFields) {
// last field of the document
// this is crazy because the file format is crazy!
for (int i = 1; i < fieldCount; i++) {
tvd.writeVLong(fps[i] - fps[i-1]);
}
}
}
private final BytesRef lastTerm = new BytesRef(10);
// NOTE: we override addProx, so we don't need to buffer when indexing.
// we also don't buffer during bulk merges.
private int offsetStartBuffer[] = new int[10];
private int offsetEndBuffer[] = new int[10];
private int offsetIndex = 0;
private int offsetFreq = 0;
private boolean positions = false;
private boolean offsets = false;
@Override
public void startTerm(BytesRef term, int freq) throws IOException {
final int prefix = StringHelper.bytesDifference(lastTerm, term);
final int suffix = term.length - prefix;
tvf.writeVInt(prefix);
tvf.writeVInt(suffix);
tvf.writeBytes(term.bytes, term.offset + prefix, suffix);
tvf.writeVInt(freq);
lastTerm.copyBytes(term);
lastPosition = lastOffset = 0;
if (offsets && positions) {
// we might need to buffer if its a non-bulk merge
offsetStartBuffer = ArrayUtil.grow(offsetStartBuffer, freq);
offsetEndBuffer = ArrayUtil.grow(offsetEndBuffer, freq);
offsetIndex = 0;
offsetFreq = freq;
}
}
int lastPosition = 0;
int lastOffset = 0;
@Override
public void addProx(int numProx, DataInput positions, DataInput offsets) throws IOException {
// TODO: technically we could just copy bytes and not re-encode if we knew the length...
if (positions != null) {
for (int i = 0; i < numProx; i++) {
tvf.writeVInt(positions.readVInt());
}
}
if (offsets != null) {
for (int i = 0; i < numProx; i++) {
tvf.writeVInt(offsets.readVInt());
tvf.writeVInt(offsets.readVInt());
}
}
}
@Override
public void addPosition(int position, int startOffset, int endOffset) throws IOException {
if (positions && offsets) {
// write position delta
tvf.writeVInt(position - lastPosition);
lastPosition = position;
// buffer offsets
offsetStartBuffer[offsetIndex] = startOffset;
offsetEndBuffer[offsetIndex] = endOffset;
offsetIndex++;
// dump buffer if we are done
if (offsetIndex == offsetFreq) {
for (int i = 0; i < offsetIndex; i++) {
tvf.writeVInt(offsetStartBuffer[i] - lastOffset);
tvf.writeVInt(offsetEndBuffer[i] - offsetStartBuffer[i]);
lastOffset = offsetEndBuffer[i];
}
}
} else if (positions) {
// write position delta
tvf.writeVInt(position - lastPosition);
lastPosition = position;
} else if (offsets) {
// write offset deltas
tvf.writeVInt(startOffset - lastOffset);
tvf.writeVInt(endOffset - startOffset);
lastOffset = endOffset;
}
}
@Override
public void abort() {
try {
close();
} catch (IOException ignored) {}
IOUtils.deleteFilesIgnoringExceptions(directory, IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_INDEX_EXTENSION),
IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_DOCUMENTS_EXTENSION),
IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_FIELDS_EXTENSION));
}
@Override
public void finish(int numDocs) throws IOException {
if (4+((long) numDocs)*16 != tvx.getFilePointer())
// This is most likely a bug in Sun JRE 1.6.0_04/_05;
// we detect that the bug has struck, here, and
// throw an exception to prevent the corruption from
// entering the index. See LUCENE-1282 for
// details.
throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + numDocs + " but tvx size is " + tvx.getFilePointer() + " file=" + tvx.toString() + "; now aborting this merge to prevent index corruption");
}
/** Close all streams. */
@Override
public void close() throws IOException {
// make an effort to close all streams we can but remember and re-throw
// the first exception encountered in this process
IOUtils.close(tvx, tvd, tvf);
tvx = tvd = tvf = null;
}
}
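The prefix compression written by startTerm() above is the mirror image of the decoding in Lucene3xTermVectorsReader.next(). Below is a self-contained round-trip sketch of that scheme using plain byte arrays rather than Lucene types; the class and variable names are illustrative only:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Round-trip sketch of the (prefix, suffixLen, suffixBytes) term encoding.
public class PrefixDeltaTermsDemo {
  public static void main(String[] args) {
    List<byte[]> terms = Arrays.asList(
        "apple".getBytes(), "apply".getBytes(), "banana".getBytes());

    // encode: like startTerm(), store only the bytes that differ from the previous term
    List<int[]> headers = new ArrayList<>();   // {sharedPrefixLen, suffixLen}
    List<byte[]> suffixes = new ArrayList<>();
    byte[] last = new byte[0];
    for (byte[] t : terms) {
      int prefix = 0;
      while (prefix < Math.min(last.length, t.length) && last[prefix] == t[prefix]) {
        prefix++;
      }
      headers.add(new int[] {prefix, t.length - prefix});
      suffixes.add(Arrays.copyOfRange(t, prefix, t.length));
      last = t;
    }

    // decode: like next(), rebuild each term from the previous one
    byte[] cur = new byte[0];
    for (int i = 0; i < terms.size(); i++) {
      int prefix = headers.get(i)[0], sufLen = headers.get(i)[1];
      byte[] next = new byte[prefix + sufLen];
      System.arraycopy(cur, 0, next, 0, prefix);
      System.arraycopy(suffixes.get(i), 0, next, prefix, sufLen);
      cur = next;
      System.out.println(new String(cur)); // apple, apply, banana
    }
  }
}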
}