mirror of https://github.com/apache/lucene.git
LUCENE-3613: split out 4.0/3.x term vectors implementations
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1232652 13f79535-47bb-0310-9956-ffa450edef68
parent c4f57c6081
commit d159f25b63

@@ -51,8 +51,7 @@ public class Lucene3xCodec extends Codec {
   // TODO: this should really be a different impl
   private final StoredFieldsFormat fieldsFormat = new Lucene40StoredFieldsFormat();
 
-  // TODO: this should really be a different impl
-  private final TermVectorsFormat vectorsFormat = new Lucene40TermVectorsFormat();
+  private final TermVectorsFormat vectorsFormat = new Lucene3xTermVectorsFormat();
 
   private final FieldInfosFormat fieldInfosFormat = new Lucene3xFieldInfosFormat();
 

@@ -0,0 +1,49 @@
package org.apache.lucene.codecs.lucene3x;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Set;

import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;

public class Lucene3xTermVectorsFormat extends TermVectorsFormat {

  @Override
  public TermVectorsReader vectorsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context) throws IOException {
    return new Lucene3xTermVectorsReader(directory, segmentInfo, fieldInfos, context);
  }

  @Override
  public TermVectorsWriter vectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
    // TODO all these IAEs in preflex should be UOEs?
    throw new IllegalArgumentException("this codec can only be used for reading");
  }

  @Override
  public void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
    Lucene3xTermVectorsReader.files(dir, info, files);
  }

}

@@ -0,0 +1,672 @@
package org.apache.lucene.codecs.lucene3x;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

public class Lucene3xTermVectorsReader extends TermVectorsReader {

  // NOTE: if you make a new format, it must be larger than
  // the current format

  // Changed strings to UTF8 with length-in-bytes not length-in-chars
  static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;

  // NOTE: always change this if you switch to a new format!
  // whenever you add a new format, make it 1 larger (positive version logic)!
  public static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;

  // when removing support for old versions, leave the last supported version here
  public static final int FORMAT_MINIMUM = FORMAT_UTF8_LENGTH_IN_BYTES;

  // The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
  static final int FORMAT_SIZE = 4;

  public static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;

  public static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;

  /** Extension of vectors fields file */
  public static final String VECTORS_FIELDS_EXTENSION = "tvf";

  /** Extension of vectors documents file */
  public static final String VECTORS_DOCUMENTS_EXTENSION = "tvd";

  /** Extension of vectors index file */
  public static final String VECTORS_INDEX_EXTENSION = "tvx";

  private FieldInfos fieldInfos;

  private IndexInput tvx;
  private IndexInput tvd;
  private IndexInput tvf;
  private int size;
  private int numTotalDocs;

  // The docID offset where our docs begin in the index
  // file. This will be 0 if we have our own private file.
  private int docStoreOffset;

  private final int format;

  // used by clone
  Lucene3xTermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, int docStoreOffset, int format) {
    this.fieldInfos = fieldInfos;
    this.tvx = tvx;
    this.tvd = tvd;
    this.tvf = tvf;
    this.size = size;
    this.numTotalDocs = numTotalDocs;
    this.docStoreOffset = docStoreOffset;
    this.format = format;
  }

  public Lucene3xTermVectorsReader(Directory d, SegmentInfo si, FieldInfos fieldInfos, IOContext context)
      throws CorruptIndexException, IOException {
    final String segment = si.getDocStoreSegment();
    final int docStoreOffset = si.getDocStoreOffset();
    final int size = si.docCount;

    boolean success = false;

    try {
      String idxName = IndexFileNames.segmentFileName(segment, "", VECTORS_INDEX_EXTENSION);
      tvx = d.openInput(idxName, context);
      format = checkValidFormat(tvx);
      String fn = IndexFileNames.segmentFileName(segment, "", VECTORS_DOCUMENTS_EXTENSION);
      tvd = d.openInput(fn, context);
      final int tvdFormat = checkValidFormat(tvd);
      fn = IndexFileNames.segmentFileName(segment, "", VECTORS_FIELDS_EXTENSION);
      tvf = d.openInput(fn, context);
      final int tvfFormat = checkValidFormat(tvf);

      assert format == tvdFormat;
      assert format == tvfFormat;

      numTotalDocs = (int) (tvx.length() >> 4);

      if (-1 == docStoreOffset) {
        this.docStoreOffset = 0;
        this.size = numTotalDocs;
        assert size == 0 || numTotalDocs == size;
      } else {
        this.docStoreOffset = docStoreOffset;
        this.size = size;
        // Verify the file is long enough to hold all of our
        // docs
        assert numTotalDocs >= size + docStoreOffset: "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset;
      }

      this.fieldInfos = fieldInfos;
      success = true;
    } finally {
      // With lock-less commits, it's entirely possible (and
      // fine) to hit a FileNotFound exception above. In
      // this case, we want to explicitly close any subset
      // of things that were opened so that we don't have to
      // wait for a GC to do so.
      if (!success) {
        close();
      }
    }
  }

  // Not private to avoid synthetic access$NNN methods
  void seekTvx(final int docNum) throws IOException {
    tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
  }

  private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException {
    int format = in.readInt();
    if (format < FORMAT_MINIMUM)
      throw new IndexFormatTooOldException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT);
    if (format > FORMAT_CURRENT)
      throw new IndexFormatTooNewException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT);
    return format;
  }

  public void close() throws IOException {
    IOUtils.close(tvx, tvd, tvf);
  }

  /**
   * @return The number of documents in the reader
   */
  int size() {
    return size;
  }

  private class TVFields extends Fields {
    private final int[] fieldNumbers;
    private final long[] fieldFPs;
    private final Map<Integer,Integer> fieldNumberToIndex = new HashMap<Integer,Integer>();

    public TVFields(int docID) throws IOException {
      seekTvx(docID);
      tvd.seek(tvx.readLong());

      final int fieldCount = tvd.readVInt();
      assert fieldCount >= 0;
      if (fieldCount != 0) {
        fieldNumbers = new int[fieldCount];
        fieldFPs = new long[fieldCount];
        for(int fieldUpto=0;fieldUpto<fieldCount;fieldUpto++) {
          final int fieldNumber = tvd.readVInt();
          fieldNumbers[fieldUpto] = fieldNumber;
          fieldNumberToIndex.put(fieldNumber, fieldUpto);
        }

        long position = tvx.readLong();
        fieldFPs[0] = position;
        for(int fieldUpto=1;fieldUpto<fieldCount;fieldUpto++) {
          position += tvd.readVLong();
          fieldFPs[fieldUpto] = position;
        }
      } else {
        // TODO: we can improve writer here, eg write 0 into
        // tvx file, so we know on first read from tvx that
        // this doc has no TVs
        fieldNumbers = null;
        fieldFPs = null;
      }
    }

    @Override
    public FieldsEnum iterator() throws IOException {

      return new FieldsEnum() {
        private int fieldUpto;

        @Override
        public String next() throws IOException {
          if (fieldNumbers != null && fieldUpto < fieldNumbers.length) {
            return fieldInfos.fieldName(fieldNumbers[fieldUpto++]);
          } else {
            return null;
          }
        }

        @Override
        public Terms terms() throws IOException {
          return TVFields.this.terms(fieldInfos.fieldName(fieldNumbers[fieldUpto-1]));
        }
      };
    }

    @Override
    public Terms terms(String field) throws IOException {
      final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
      if (fieldInfo == null) {
        // No such field
        return null;
      }

      final Integer fieldIndex = fieldNumberToIndex.get(fieldInfo.number);
      if (fieldIndex == null) {
        // Term vectors were not indexed for this field
        return null;
      }

      return new TVTerms(fieldFPs[fieldIndex]);
    }

    @Override
    public int getUniqueFieldCount() {
      if (fieldNumbers == null) {
        return 0;
      } else {
        return fieldNumbers.length;
      }
    }
  }

  private class TVTerms extends Terms {
    private final int numTerms;
    private final long tvfFPStart;

    public TVTerms(long tvfFP) throws IOException {
      tvf.seek(tvfFP);
      numTerms = tvf.readVInt();
      tvfFPStart = tvf.getFilePointer();
    }

    @Override
    public TermsEnum iterator(TermsEnum reuse) throws IOException {
      TVTermsEnum termsEnum;
      if (reuse instanceof TVTermsEnum) {
        termsEnum = (TVTermsEnum) reuse;
        if (!termsEnum.canReuse(tvf)) {
          termsEnum = new TVTermsEnum();
        }
      } else {
        termsEnum = new TVTermsEnum();
      }
      termsEnum.reset(numTerms, tvfFPStart);
      return termsEnum;
    }

    @Override
    public long getUniqueTermCount() {
      return numTerms;
    }

    @Override
    public long getSumTotalTermFreq() {
      return -1;
    }

    @Override
    public long getSumDocFreq() {
      // Every term occurs in just one doc:
      return numTerms;
    }

    @Override
    public int getDocCount() {
      return 1;
    }

    @Override
    public Comparator<BytesRef> getComparator() {
      // TODO: really indexer hardwires
      // this...? I guess codec could buffer and re-sort...
      return BytesRef.getUTF8SortedAsUnicodeComparator();
    }
  }

  private class TVTermsEnum extends TermsEnum {
    private final IndexInput origTVF;
    private final IndexInput tvf;
    private int numTerms;
    private int nextTerm;
    private int freq;
    private BytesRef lastTerm = new BytesRef();
    private BytesRef term = new BytesRef();
    private boolean storePositions;
    private boolean storeOffsets;
    private long tvfFP;

    private int[] positions;
    private int[] startOffsets;
    private int[] endOffsets;

    // NOTE: tvf is pre-positioned by caller
    public TVTermsEnum() throws IOException {
      this.origTVF = Lucene3xTermVectorsReader.this.tvf;
      tvf = (IndexInput) origTVF.clone();
    }

    public boolean canReuse(IndexInput tvf) {
      return tvf == origTVF;
    }

    public void reset(int numTerms, long tvfFPStart) throws IOException {
      this.numTerms = numTerms;
      nextTerm = 0;
      tvf.seek(tvfFPStart);
      final byte bits = tvf.readByte();
      storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
      storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
      tvfFP = 1+tvfFPStart;
      positions = null;
      startOffsets = null;
      endOffsets = null;
    }

    // NOTE: slow! (linear scan)
    @Override
    public SeekStatus seekCeil(BytesRef text, boolean useCache) throws IOException {
      if (nextTerm != 0 && text.compareTo(term) < 0) {
        nextTerm = 0;
        tvf.seek(tvfFP);
      }

      while (next() != null) {
        final int cmp = text.compareTo(term);
        if (cmp < 0) {
          return SeekStatus.NOT_FOUND;
        } else if (cmp == 0) {
          return SeekStatus.FOUND;
        }
      }

      return SeekStatus.END;
    }

    @Override
    public void seekExact(long ord) {
      throw new UnsupportedOperationException();
    }

    @Override
    public BytesRef next() throws IOException {
      if (nextTerm >= numTerms) {
        return null;
      }
      term.copyBytes(lastTerm);
      final int start = tvf.readVInt();
      final int deltaLen = tvf.readVInt();
      term.length = start + deltaLen;
      term.grow(term.length);
      tvf.readBytes(term.bytes, start, deltaLen);
      freq = tvf.readVInt();

      if (storePositions) {
        // TODO: we could maybe reuse last array, if we can
        // somehow be careful about consumer never using two
        // D&PEnums at once...
        positions = new int[freq];
        int pos = 0;
        for(int posUpto=0;posUpto<freq;posUpto++) {
          pos += tvf.readVInt();
          positions[posUpto] = pos;
        }
      }

      if (storeOffsets) {
        startOffsets = new int[freq];
        endOffsets = new int[freq];
        int offset = 0;
        for(int posUpto=0;posUpto<freq;posUpto++) {
          startOffsets[posUpto] = offset + tvf.readVInt();
          offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.readVInt();
        }
      }

      lastTerm.copyBytes(term);
      nextTerm++;
      return term;
    }

    @Override
    public BytesRef term() {
      return term;
    }

    @Override
    public long ord() {
      throw new UnsupportedOperationException();
    }

    @Override
    public int docFreq() {
      return 1;
    }

    @Override
    public long totalTermFreq() {
      return freq;
    }

    @Override
    public DocsEnum docs(Bits liveDocs, DocsEnum reuse, boolean needsFreqs /* ignored */) throws IOException {
      TVDocsEnum docsEnum;
      if (reuse != null && reuse instanceof TVDocsEnum) {
        docsEnum = (TVDocsEnum) reuse;
      } else {
        docsEnum = new TVDocsEnum();
      }
      docsEnum.reset(liveDocs, freq);
      return docsEnum;
    }

    @Override
    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
      if (needsOffsets && !storeOffsets) {
        return null;
      }

      if (!storePositions && !storeOffsets) {
        return null;
      }

      TVDocsAndPositionsEnum docsAndPositionsEnum;
      if (reuse != null && reuse instanceof TVDocsAndPositionsEnum) {
        docsAndPositionsEnum = (TVDocsAndPositionsEnum) reuse;
      } else {
        docsAndPositionsEnum = new TVDocsAndPositionsEnum();
      }
      docsAndPositionsEnum.reset(liveDocs, positions, startOffsets, endOffsets);
      return docsAndPositionsEnum;
    }

    @Override
    public Comparator<BytesRef> getComparator() {
      // TODO: really indexer hardwires
      // this...? I guess codec could buffer and re-sort...
      return BytesRef.getUTF8SortedAsUnicodeComparator();
    }
  }

  // NOTE: sort of a silly class, since you can get the
  // freq() already by TermsEnum.totalTermFreq
  private static class TVDocsEnum extends DocsEnum {
    private boolean didNext;
    private int doc = -1;
    private int freq;
    private Bits liveDocs;

    @Override
    public int freq() {
      return freq;
    }

    @Override
    public int docID() {
      return doc;
    }

    @Override
    public int nextDoc() {
      if (!didNext && (liveDocs == null || liveDocs.get(0))) {
        didNext = true;
        return (doc = 0);
      } else {
        return (doc = NO_MORE_DOCS);
      }
    }

    @Override
    public int advance(int target) {
      if (!didNext && target == 0) {
        return nextDoc();
      } else {
        return (doc = NO_MORE_DOCS);
      }
    }

    public void reset(Bits liveDocs, int freq) {
      this.liveDocs = liveDocs;
      this.freq = freq;
      this.doc = -1;
      didNext = false;
    }
  }

  private static class TVDocsAndPositionsEnum extends DocsAndPositionsEnum {
    private boolean didNext;
    private int doc = -1;
    private int nextPos;
    private Bits liveDocs;
    private int[] positions;
    private int[] startOffsets;
    private int[] endOffsets;

    @Override
    public int freq() {
      if (positions != null) {
        return positions.length;
      } else {
        assert startOffsets != null;
        return startOffsets.length;
      }
    }

    @Override
    public int docID() {
      return doc;
    }

    @Override
    public int nextDoc() {
      if (!didNext && (liveDocs == null || liveDocs.get(0))) {
        didNext = true;
        return (doc = 0);
      } else {
        return (doc = NO_MORE_DOCS);
      }
    }

    @Override
    public int advance(int target) {
      if (!didNext && target == 0) {
        return nextDoc();
      } else {
        return (doc = NO_MORE_DOCS);
      }
    }

    public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets) {
      this.liveDocs = liveDocs;
      this.positions = positions;
      this.startOffsets = startOffsets;
      this.endOffsets = endOffsets;
      this.doc = -1;
      didNext = false;
      nextPos = 0;
    }

    @Override
    public BytesRef getPayload() {
      return null;
    }

    @Override
    public boolean hasPayload() {
      return false;
    }

    @Override
    public int nextPosition() {
      assert (positions != null && nextPos < positions.length) ||
        startOffsets != null && nextPos < startOffsets.length;

      if (positions != null) {
        return positions[nextPos++];
      } else {
        nextPos++;
        return -1;
      }
    }

    @Override
    public int startOffset() {
      assert startOffsets != null;
      return startOffsets[nextPos-1];
    }

    @Override
    public int endOffset() {
      assert endOffsets != null;
      return endOffsets[nextPos-1];
    }
  }

  @Override
  public Fields get(int docID) throws IOException {
    if (docID < 0 || docID >= numTotalDocs) {
throw new IllegalArgumentException("doID=" + docID + " is out of bounds [0.." + (numTotalDocs-1) + "]");
|
||||
    }
    if (tvx != null) {
      Fields fields = new TVFields(docID);
      if (fields.getUniqueFieldCount() == 0) {
        // TODO: we can improve writer here, eg write 0 into
        // tvx file, so we know on first read from tvx that
        // this doc has no TVs
        return null;
      } else {
        return fields;
      }
    } else {
      return null;
    }
  }

  @Override
  public TermVectorsReader clone() {
    IndexInput cloneTvx = null;
    IndexInput cloneTvd = null;
    IndexInput cloneTvf = null;

    // These are null when a TermVectorsReader was created
    // on a segment that did not have term vectors saved
    if (tvx != null && tvd != null && tvf != null) {
      cloneTvx = (IndexInput) tvx.clone();
      cloneTvd = (IndexInput) tvd.clone();
      cloneTvf = (IndexInput) tvf.clone();
    }

    return new Lucene3xTermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, docStoreOffset, format);
  }

  public static void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
    if (info.getHasVectors()) {
      if (info.getDocStoreOffset() != -1) {
        assert info.getDocStoreSegment() != null;
        if (!info.getDocStoreIsCompoundFile()) {
          files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_INDEX_EXTENSION));
          files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_FIELDS_EXTENSION));
          files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_DOCUMENTS_EXTENSION));
        }
      } else {
        files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_INDEX_EXTENSION));
        files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_FIELDS_EXTENSION));
        files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_DOCUMENTS_EXTENSION));
      }
    }
  }
}
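
A note on the addressing arithmetic used by seekTvx() above: the .tvx index file is a 4-byte format header (FORMAT_SIZE) followed by one 16-byte record per document, two longs pointing into .tvd and .tvf respectively. That is why the constructor recovers the document count as tvx.length() >> 4, and why seekTvx() seeks to (docNum + docStoreOffset) * 16L + FORMAT_SIZE. Below is a minimal standalone sketch of that addressing, assuming only the layout the constants above describe; the class and method names are illustrative, not Lucene API:

// Sketch of the .tvx addressing used by seekTvx() above.
// Layout: [4-byte format header][doc 0: tvdPtr(8) tvfPtr(8)][doc 1: ...] ...
class TvxAddressing {
  static final int FORMAT_SIZE = 4;   // format header at the start of the file
  static final int RECORD_SIZE = 16;  // two longs per document: pointer into .tvd, pointer into .tvf

  // Byte offset of a document's record, as in seekTvx(docNum):
  static long recordOffset(int docNum, int docStoreOffset) {
    return (docNum + docStoreOffset) * 16L + FORMAT_SIZE;
  }

  // Document count recovered from the file length, as in numTotalDocs = (int) (tvx.length() >> 4).
  // The 4-byte header is smaller than one 16-byte record, so the integer division rounds it away.
  static int numDocs(long tvxLength) {
    return (int) (tvxLength >> 4);
  }

  public static void main(String[] args) {
    // For a 3-doc segment with a private doc store (docStoreOffset == 0):
    long fileLength = FORMAT_SIZE + 3L * RECORD_SIZE; // 52 bytes
    System.out.println(numDocs(fileLength));          // 3 (52 >> 4 == 3)
    System.out.println(recordOffset(2, 0));           // 36: header + two 16-byte records
  }
}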

@@ -74,8 +74,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
   static final String VECTORS_DOCUMENTS_EXTENSION = "tvd";
 
   /** Extension of vectors index file */
-  // TODO: shouldnt be visible to segments reader, preflex should do this itself somehow
-  public static final String VECTORS_INDEX_EXTENSION = "tvx";
+  static final String VECTORS_INDEX_EXTENSION = "tvx";
 
   private FieldInfos fieldInfos;
 
@@ -85,28 +84,22 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
   private int size;
   private int numTotalDocs;
 
-  // The docID offset where our docs begin in the index
-  // file. This will be 0 if we have our own private file.
-  private int docStoreOffset;
-
   private final int format;
 
   // used by clone
-  Lucene40TermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, int docStoreOffset, int format) {
+  Lucene40TermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, int format) {
     this.fieldInfos = fieldInfos;
     this.tvx = tvx;
     this.tvd = tvd;
     this.tvf = tvf;
     this.size = size;
     this.numTotalDocs = numTotalDocs;
-    this.docStoreOffset = docStoreOffset;
     this.format = format;
   }
 
   public Lucene40TermVectorsReader(Directory d, SegmentInfo si, FieldInfos fieldInfos, IOContext context)
       throws CorruptIndexException, IOException {
-    final String segment = si.getDocStoreSegment();
-    final int docStoreOffset = si.getDocStoreOffset();
+    final String segment = si.name;
     final int size = si.docCount;
 
     boolean success = false;
@@ -127,17 +120,8 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
 
       numTotalDocs = (int) (tvx.length() >> 4);
 
-      if (-1 == docStoreOffset) {
-        this.docStoreOffset = 0;
-        this.size = numTotalDocs;
-        assert size == 0 || numTotalDocs == size;
-      } else {
-        this.docStoreOffset = docStoreOffset;
-        this.size = size;
-        // Verify the file is long enough to hold all of our
-        // docs
-        assert numTotalDocs >= size + docStoreOffset: "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset;
-      }
+      this.size = numTotalDocs;
+      assert size == 0 || numTotalDocs == size;
 
       this.fieldInfos = fieldInfos;
       success = true;
@@ -165,7 +149,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
 
   // Not private to avoid synthetic access$NNN methods
   void seekTvx(final int docNum) throws IOException {
-    tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
+    tvx.seek(docNum * 16L + FORMAT_SIZE);
   }
 
   boolean canReadRawDocs() {
@@ -201,7 +185,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
 
     int count = 0;
     while (count < numDocs) {
-      final int docID = docStoreOffset + startDocID + count + 1;
+      final int docID = startDocID + count + 1;
       assert docID <= numTotalDocs;
       if (docID < numTotalDocs) {
         tvdPosition = tvx.readLong();
@@ -712,23 +696,14 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
       cloneTvf = (IndexInput) tvf.clone();
     }
 
-    return new Lucene40TermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, docStoreOffset, format);
+    return new Lucene40TermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, format);
   }
 
   public static void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
     if (info.getHasVectors()) {
-      if (info.getDocStoreOffset() != -1) {
-        assert info.getDocStoreSegment() != null;
-        if (!info.getDocStoreIsCompoundFile()) {
-          files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_INDEX_EXTENSION));
-          files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_FIELDS_EXTENSION));
-          files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_DOCUMENTS_EXTENSION));
-        }
-      } else {
-        files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_INDEX_EXTENSION));
-        files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_FIELDS_EXTENSION));
-        files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_DOCUMENTS_EXTENSION));
-      }
+      files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_INDEX_EXTENSION));
+      files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_FIELDS_EXTENSION));
+      files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_DOCUMENTS_EXTENSION));
     }
   }
 }

@@ -20,6 +20,7 @@ package org.apache.lucene.codecs.preflexrw;
 import org.apache.lucene.codecs.FieldInfosFormat;
 import org.apache.lucene.codecs.NormsFormat;
 import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.TermVectorsFormat;
 import org.apache.lucene.codecs.lucene3x.Lucene3xCodec;
 import org.apache.lucene.util.LuceneTestCase;
 
@@ -31,6 +32,7 @@ public class PreFlexRWCodec extends Lucene3xCodec {
   private final PostingsFormat postings = new PreFlexRWPostingsFormat();
   private final NormsFormat norms = new PreFlexRWNormsFormat();
   private final FieldInfosFormat fieldInfos = new PreFlexRWFieldInfosFormat();
+  private final TermVectorsFormat termVectors = new PreFlexRWTermVectorsFormat();
 
   @Override
   public PostingsFormat postingsFormat() {
@@ -58,4 +60,13 @@ public class PreFlexRWCodec extends Lucene3xCodec {
       return super.fieldInfosFormat();
     }
   }
+
+  @Override
+  public TermVectorsFormat termVectorsFormat() {
+    if (LuceneTestCase.PREFLEX_IMPERSONATION_IS_ACTIVE) {
+      return termVectors;
+    } else {
+      return super.termVectorsFormat();
+    }
+  }
 }

@@ -0,0 +1,33 @@
package org.apache.lucene.codecs.preflexrw;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsFormat;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;

public class PreFlexRWTermVectorsFormat extends Lucene3xTermVectorsFormat {

  @Override
  public TermVectorsWriter vectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
    return new PreFlexRWTermVectorsWriter(directory, segment, context);
  }
}

@@ -0,0 +1,216 @@
package org.apache.lucene.codecs.preflexrw;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;

// TODO: surrogates dance!
public final class PreFlexRWTermVectorsWriter extends TermVectorsWriter {
  private final Directory directory;
  private final String segment;
  private IndexOutput tvx = null, tvd = null, tvf = null;

  public PreFlexRWTermVectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
    this.directory = directory;
    this.segment = segment;
    boolean success = false;
    try {
      // Open files for TermVector storage
      tvx = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_INDEX_EXTENSION), context);
      tvx.writeInt(Lucene3xTermVectorsReader.FORMAT_CURRENT);
      tvd = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_DOCUMENTS_EXTENSION), context);
      tvd.writeInt(Lucene3xTermVectorsReader.FORMAT_CURRENT);
      tvf = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_FIELDS_EXTENSION), context);
      tvf.writeInt(Lucene3xTermVectorsReader.FORMAT_CURRENT);
      success = true;
    } finally {
      if (!success) {
        abort();
      }
    }
  }

  @Override
  public void startDocument(int numVectorFields) throws IOException {
    lastFieldName = null;
    this.numVectorFields = numVectorFields;
    tvx.writeLong(tvd.getFilePointer());
    tvx.writeLong(tvf.getFilePointer());
    tvd.writeVInt(numVectorFields);
    fieldCount = 0;
    fps = ArrayUtil.grow(fps, numVectorFields);
  }

  private long fps[] = new long[10]; // pointers to the tvf before writing each field
  private int fieldCount = 0;        // number of fields we have written so far for this document
  private int numVectorFields = 0;   // total number of fields we will write for this document
  private String lastFieldName;

  @Override
  public void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets) throws IOException {
    assert lastFieldName == null || info.name.compareTo(lastFieldName) > 0: "fieldName=" + info.name + " lastFieldName=" + lastFieldName;
    lastFieldName = info.name;
    this.positions = positions;
    this.offsets = offsets;
    lastTerm.length = 0;
    fps[fieldCount++] = tvf.getFilePointer();
    tvd.writeVInt(info.number);
    tvf.writeVInt(numTerms);
    byte bits = 0x0;
    if (positions)
      bits |= Lucene3xTermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
    if (offsets)
      bits |= Lucene3xTermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
    tvf.writeByte(bits);

    assert fieldCount <= numVectorFields;
    if (fieldCount == numVectorFields) {
      // last field of the document
      // this is crazy because the file format is crazy!
      for (int i = 1; i < fieldCount; i++) {
        tvd.writeVLong(fps[i] - fps[i-1]);
      }
    }
  }

  private final BytesRef lastTerm = new BytesRef(10);

  // NOTE: we override addProx, so we don't need to buffer when indexing.
  // we also don't buffer during bulk merges.
  private int offsetStartBuffer[] = new int[10];
  private int offsetEndBuffer[] = new int[10];
  private int offsetIndex = 0;
  private int offsetFreq = 0;
  private boolean positions = false;
  private boolean offsets = false;

  @Override
  public void startTerm(BytesRef term, int freq) throws IOException {
    final int prefix = StringHelper.bytesDifference(lastTerm, term);
    final int suffix = term.length - prefix;
    tvf.writeVInt(prefix);
    tvf.writeVInt(suffix);
    tvf.writeBytes(term.bytes, term.offset + prefix, suffix);
    tvf.writeVInt(freq);
    lastTerm.copyBytes(term);
    lastPosition = lastOffset = 0;

    if (offsets && positions) {
      // we might need to buffer if its a non-bulk merge
      offsetStartBuffer = ArrayUtil.grow(offsetStartBuffer, freq);
      offsetEndBuffer = ArrayUtil.grow(offsetEndBuffer, freq);
      offsetIndex = 0;
      offsetFreq = freq;
    }
  }

  int lastPosition = 0;
  int lastOffset = 0;

  @Override
  public void addProx(int numProx, DataInput positions, DataInput offsets) throws IOException {
    // TODO: technically we could just copy bytes and not re-encode if we knew the length...
    if (positions != null) {
      for (int i = 0; i < numProx; i++) {
        tvf.writeVInt(positions.readVInt());
      }
    }

    if (offsets != null) {
      for (int i = 0; i < numProx; i++) {
        tvf.writeVInt(offsets.readVInt());
        tvf.writeVInt(offsets.readVInt());
      }
    }
  }

  @Override
  public void addPosition(int position, int startOffset, int endOffset) throws IOException {
    if (positions && offsets) {
      // write position delta
      tvf.writeVInt(position - lastPosition);
      lastPosition = position;

      // buffer offsets
      offsetStartBuffer[offsetIndex] = startOffset;
      offsetEndBuffer[offsetIndex] = endOffset;
      offsetIndex++;

      // dump buffer if we are done
      if (offsetIndex == offsetFreq) {
        for (int i = 0; i < offsetIndex; i++) {
          tvf.writeVInt(offsetStartBuffer[i] - lastOffset);
          tvf.writeVInt(offsetEndBuffer[i] - offsetStartBuffer[i]);
          lastOffset = offsetEndBuffer[i];
        }
      }
    } else if (positions) {
      // write position delta
      tvf.writeVInt(position - lastPosition);
      lastPosition = position;
    } else if (offsets) {
      // write offset deltas
      tvf.writeVInt(startOffset - lastOffset);
      tvf.writeVInt(endOffset - startOffset);
      lastOffset = endOffset;
    }
  }

  @Override
  public void abort() {
    try {
      close();
    } catch (IOException ignored) {}
    IOUtils.deleteFilesIgnoringExceptions(directory, IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_INDEX_EXTENSION),
        IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_DOCUMENTS_EXTENSION),
        IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_FIELDS_EXTENSION));
  }

  @Override
  public void finish(int numDocs) throws IOException {
    if (4+((long) numDocs)*16 != tvx.getFilePointer())
      // This is most likely a bug in Sun JRE 1.6.0_04/_05;
      // we detect that the bug has struck, here, and
      // throw an exception to prevent the corruption from
      // entering the index. See LUCENE-1282 for
      // details.
      throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + numDocs + " but tvx size is " + tvx.getFilePointer() + " file=" + tvx.toString() + "; now aborting this merge to prevent index corruption");
  }

  /** Close all streams. */
  @Override
  public void close() throws IOException {
    // make an effort to close all streams we can but remember and re-throw
    // the first exception encountered in this process
    IOUtils.close(tvx, tvd, tvf);
    tvx = tvd = tvf = null;
  }
}
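
A note on the term encoding that PreFlexRWTermVectorsWriter.startTerm() above writes and TVTermsEnum.next() in Lucene3xTermVectorsReader decodes: each term is stored as the length of the prefix it shares with the previous term, the suffix length, and the suffix bytes. Below is a minimal round-trip sketch of the same scheme, assuming sorted input terms; it uses a plain DataOutputStream with fixed-width ints where Lucene writes vints, and all names are illustrative rather than Lucene API:

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

// Sketch of the prefix-delta term encoding used by startTerm() and TVTermsEnum.next() above.
class PrefixDeltaTerms {
  static int sharedPrefix(byte[] a, byte[] b) {
    int i = 0;
    while (i < a.length && i < b.length && a[i] == b[i]) i++;
    return i;
  }

  static byte[] encode(List<byte[]> terms) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bytes);
    byte[] last = new byte[0];
    for (byte[] term : terms) {
      int prefix = sharedPrefix(last, term);   // bytes shared with the previous term
      int suffix = term.length - prefix;
      out.writeInt(prefix);
      out.writeInt(suffix);
      out.write(term, prefix, suffix);         // only the new tail is written
      last = term;
    }
    return bytes.toByteArray();
  }

  static List<byte[]> decode(byte[] data, int numTerms) throws IOException {
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(data));
    List<byte[]> terms = new ArrayList<>();
    byte[] last = new byte[0];
    for (int i = 0; i < numTerms; i++) {
      int prefix = in.readInt();
      int suffix = in.readInt();
      byte[] term = new byte[prefix + suffix]; // term.length = start + deltaLen, as in next()
      System.arraycopy(last, 0, term, 0, prefix);
      in.readFully(term, prefix, suffix);
      last = term;
      terms.add(term);
    }
    return terms;
  }

  public static void main(String[] args) throws IOException {
    List<byte[]> terms = new ArrayList<>();
    for (String s : new String[] {"app", "apple", "apply", "banana"}) {
      terms.add(s.getBytes(StandardCharsets.UTF_8));
    }
    for (byte[] t : decode(encode(terms), terms.size())) {
      System.out.println(new String(t, StandardCharsets.UTF_8)); // round-trips the sorted terms
    }
  }
}

Writing only the new tail of each term is what keeps the per-field term dictionary compact: consecutive sorted terms usually share long prefixes, so the suffix is short.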