mirror of https://github.com/apache/lucene.git
LUCENE-755: Added the ability to store arbitrary binary metadata (payloads) in the posting list.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@518486 13f79535-47bb-0310-9956-ffa450edef68
parent 9da8211775
commit eb20c06a62
@@ -82,6 +82,13 @@ New features
 
  2. LUCENE-822: Added FieldSelector capabilities to Searchable for use with
     RemoteSearcher, and other Searchable implementations.
     (Mark Miller, Grant Ingersoll)
 
+ 3. LUCENE-755: Added the ability to store arbitrary binary metadata in the
+    posting list. These metadata are called Payloads. For every position of
+    a Token one Payload in the form of a variable length byte array can be
+    stored in the prox file.
+    Remark: The APIs introduced with this feature are in experimental state
+    and thus contain appropriate warnings in the javadocs.
+    (Michael Busch)
 
 Optimizations
 
  1. LUCENE-761: The proxStream is now cloned lazily in SegmentTermPositions
@@ -1,5 +1,8 @@
 package org.apache.lucene.analysis;
 
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.index.TermPositions;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -20,23 +23,40 @@ package org.apache.lucene.analysis;
 /** A Token is an occurrence of a term from the text of a field.  It consists of
   a term's text, the start and end offset of the term in the text of the field,
   and a type string.
+  <p>
   The start and end offsets permit applications to re-associate a token with
   its source text, e.g., to display highlighted query terms in a document
   browser, or to show matching text fragments in a KWIC (KeyWord In Context)
   display, etc.
+  <p>
   The type is an interned string, assigned by a lexical analyzer
   (a.k.a. tokenizer), naming the lexical or syntactic class that the token
   belongs to.  For example an end of sentence marker token might be implemented
-  with type "eos".  The default token type is "word".  */
+  with type "eos".  The default token type is "word".
+  <p>
+  A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
+  length byte array. Use {@link TermPositions#getPayloadLength()} and
+  {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.
+
+  <br><br>
+  <b>
+  Warning: The status of the Payloads feature is experimental. The APIs
+  introduced here might change in the future and will not be supported anymore
+  in such a case. If you want to use this feature in a production environment
+  you should wait for an official release.
+  </b>
+
+  @see org.apache.lucene.index.Payload
+  */
+// TODO: Remove warning after API has been finalized
 public class Token implements Cloneable {
   String termText;                          // the text of the term
   int startOffset;                          // start in source text
   int endOffset;                            // end in source text
   String type = "word";                     // lexical type
+
+  Payload payload;
 
   private int positionIncrement = 1;
 
   /** Constructs a Token with the given term text, and start & end offsets.
@@ -115,6 +135,36 @@ public class Token implements Cloneable {
   /** Returns this Token's lexical type.  Defaults to "word". */
   public final String type() { return type; }
 
+  /**
+   * Sets this Token's payload.<br>
+   * <br>
+   * <b>
+   * Warning: The status of the Payloads feature is experimental. The APIs
+   * introduced here might change in the future and will not be supported anymore
+   * in such a case. If you want to use this feature in a production environment
+   * you should wait for an official release.
+   * </b>
+   */
+  // TODO: Remove warning after API has been finalized
+  public void setPayload(Payload payload) {
+    this.payload = payload;
+  }
+
+  /**
+   * Returns this Token's payload.<br>
+   * <br>
+   * <b>
+   * Warning: The status of the Payloads feature is experimental. The APIs
+   * introduced here might change in the future and will not be supported anymore
+   * in such a case. If you want to use this feature in a production environment
+   * you should wait for an official release.
+   * </b>
+   */
+  // TODO: Remove warning after API has been finalized
+  public Payload getPayload() {
+    return this.payload;
+  }
+
   public String toString() {
     StringBuffer sb = new StringBuffer();
     sb.append("(" + termText + "," + startOffset + "," + endOffset);
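The two methods above are the writer-side entry point for payloads. A minimal sketch of attaching one during analysis, assuming the Token-returning TokenStream API of this era; the filter class and the constant one-byte tag are invented for illustration, not part of the commit:

  import java.io.IOException;
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenFilter;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.index.Payload;

  // Attaches a constant one-byte payload to every token of the wrapped stream.
  class TaggingTokenFilter extends TokenFilter {
    private final byte tag;

    TaggingTokenFilter(TokenStream input, byte tag) {
      super(input);
      this.tag = tag;
    }

    public Token next() throws IOException {
      Token t = input.next();
      if (t != null) {
        t.setPayload(new Payload(new byte[] { tag }));  // one payload per position
      }
      return t;
    }
  }

Any analyzer that wraps its stream in such a filter will cause DocumentWriter (below) to flag the field as payload-storing.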
@@ -31,6 +31,7 @@ import java.io.PrintStream;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
+import java.util.BitSet;
 import java.util.Enumeration;
 import java.util.Hashtable;
 import java.util.Iterator;
@@ -69,9 +70,30 @@ final class DocumentWriter {
 
   final void addDocument(String segment, Document doc)
           throws CorruptIndexException, IOException {
-    // write field names
+    // create field infos
     fieldInfos = new FieldInfos();
     fieldInfos.add(doc);
+
+    // invert doc into postingTable
+    postingTable.clear();                         // clear postingTable
+    fieldLengths = new int[fieldInfos.size()];    // init fieldLengths
+    fieldPositions = new int[fieldInfos.size()];  // init fieldPositions
+    fieldOffsets = new int[fieldInfos.size()];    // init fieldOffsets
+    fieldStoresPayloads = new BitSet(fieldInfos.size());
+
+    fieldBoosts = new float[fieldInfos.size()];   // init fieldBoosts
+    Arrays.fill(fieldBoosts, doc.getBoost());
+
+    // Before we write the FieldInfos we invert the Document. The reason is that
+    // during inversion the TokenStreams of tokenized fields are being processed
+    // and we might encounter tokens that have payloads associated with them. In
+    // this case we have to update the FieldInfo of the particular field.
+    invertDocument(doc);
+
+    // sort postingTable into an array
+    Posting[] postings = sortPostingTable();
+
+    // write field infos
     fieldInfos.write(directory, segment + ".fnm");
 
     // write field values
@@ -82,21 +104,7 @@ final class DocumentWriter {
     } finally {
       fieldsWriter.close();
     }
-
-    // invert doc into postingTable
-    postingTable.clear();                         // clear postingTable
-    fieldLengths = new int[fieldInfos.size()];    // init fieldLengths
-    fieldPositions = new int[fieldInfos.size()];  // init fieldPositions
-    fieldOffsets = new int[fieldInfos.size()];    // init fieldOffsets
-
-    fieldBoosts = new float[fieldInfos.size()];   // init fieldBoosts
-    Arrays.fill(fieldBoosts, doc.getBoost());
-
-    invertDocument(doc);
-
-    // sort postingTable into an array
-    Posting[] postings = sortPostingTable();
 
     /*
     for (int i = 0; i < postings.length; i++) {
       Posting posting = postings[i];
@@ -125,6 +133,10 @@ final class DocumentWriter {
   private int[] fieldPositions;
   private int[] fieldOffsets;
   private float[] fieldBoosts;
+
+  // If any of the tokens of a particular field carry a payload
+  // then we enable payloads for that field.
+  private BitSet fieldStoresPayloads;
 
   // Tokenizes the fields of a document into Postings.
   private final void invertDocument(Document doc)
@@ -144,9 +156,9 @@ final class DocumentWriter {
       if (!field.isTokenized()) {               // un-tokenized field
         String stringValue = field.stringValue();
         if(field.isStoreOffsetWithTermVector())
-          addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
+          addPosition(fieldName, stringValue, position++, null, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
         else
-          addPosition(fieldName, stringValue, position++, null);
+          addPosition(fieldName, stringValue, position++, null, null);
         offset += stringValue.length();
         length++;
       } else
@@ -167,10 +179,19 @@ final class DocumentWriter {
           for (Token t = stream.next(); t != null; t = stream.next()) {
             position += (t.getPositionIncrement() - 1);
 
-            if(field.isStoreOffsetWithTermVector())
-              addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
-            else
-              addPosition(fieldName, t.termText(), position++, null);
+            Payload payload = t.getPayload();
+            if (payload != null) {
+              // enable payloads for this field
+              fieldStoresPayloads.set(fieldNumber);
+            }
+
+            TermVectorOffsetInfo termVectorOffsetInfo;
+            if (field.isStoreOffsetWithTermVector()) {
+              termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset());
+            } else {
+              termVectorOffsetInfo = null;
+            }
+            addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo);
 
             lastToken = t;
             if (++length >= maxFieldLength) {
@@ -194,11 +215,16 @@ final class DocumentWriter {
         fieldOffsets[fieldNumber] = offset;
       }
     }
+
+    // update fieldInfos for all fields that have one or more tokens with payloads
+    for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i+1)) {
+      fieldInfos.fieldInfo(i).storePayloads = true;
+    }
   }
 
   private final Term termBuffer = new Term("", ""); // avoid consing
 
-  private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
+  private final void addPosition(String field, String text, int position, Payload payload, TermVectorOffsetInfo offset) {
     termBuffer.set(field, text);
     //System.out.println("Offset: " + offset);
     Posting ti = (Posting) postingTable.get(termBuffer);
@@ -209,9 +235,25 @@ final class DocumentWriter {
         int[] positions = ti.positions;
         System.arraycopy(positions, 0, newPositions, 0, freq);
         ti.positions = newPositions;
+
+        if (ti.payloads != null) {
+          // the current field stores payloads
+          Payload[] newPayloads = new Payload[freq * 2];  // grow payloads array
+          Payload[] payloads = ti.payloads;
+          System.arraycopy(payloads, 0, newPayloads, 0, payloads.length);
+          ti.payloads = newPayloads;
+        }
       }
       ti.positions[freq] = position;              // add new position
+
+      if (payload != null) {
+        if (ti.payloads == null) {
+          // lazily allocate payload array
+          ti.payloads = new Payload[ti.positions.length];
+        }
+        ti.payloads[freq] = payload;
+      }
 
       if (offset != null) {
         if (ti.offsets.length == freq){
           TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2];
@@ -224,7 +266,7 @@ final class DocumentWriter {
       ti.freq = freq + 1;                         // update frequency
     } else {                                      // word not seen before
       Term term = new Term(field, text, false);
-      postingTable.put(term, new Posting(term, position, offset));
+      postingTable.put(term, new Posting(term, position, payload, offset));
     }
   }
 
@@ -307,10 +349,31 @@ final class DocumentWriter {
                               termIndexInterval);
       TermInfo ti = new TermInfo();
       String currentField = null;
+      boolean currentFieldHasPayloads = false;
 
       for (int i = 0; i < postings.length; i++) {
         Posting posting = postings[i];
 
+        // check to see if we switched to a new field
+        String termField = posting.term.field();
+        if (currentField != termField) {
+          // changing field - see if there is something to save
+          currentField = termField;
+          FieldInfo fi = fieldInfos.fieldInfo(currentField);
+          currentFieldHasPayloads = fi.storePayloads;
+          if (fi.storeTermVector) {
+            if (termVectorWriter == null) {
+              termVectorWriter =
+                new TermVectorsWriter(directory, segment, fieldInfos);
+              termVectorWriter.openDocument();
+            }
+            termVectorWriter.openField(currentField);
+          } else if (termVectorWriter != null) {
+            termVectorWriter.closeField();
+          }
+        }
+
         // add an entry to the dictionary with pointers to prox and freq files
         ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
         tis.add(posting.term, ti);
@@ -326,28 +389,62 @@ final class DocumentWriter {
 
         int lastPosition = 0;                     // write positions
         int[] positions = posting.positions;
+        Payload[] payloads = posting.payloads;
+        int lastPayloadLength = -1;
+
+        // The following encoding is being used for positions and payloads:
+        // Case 1: current field does not store payloads
+        //   Positions     -> <PositionDelta>^freq
+        //   PositionDelta -> VInt
+        //   The PositionDelta is the difference between the current
+        //   and the previous position
+        // Case 2: current field stores payloads
+        //   Positions     -> <PositionDelta, Payload>^freq
+        //   Payload       -> <PayloadLength?, PayloadData>
+        //   PositionDelta -> VInt
+        //   PayloadLength -> VInt
+        //   PayloadData   -> byte^PayloadLength
+        //   In this case PositionDelta/2 is the difference between
+        //   the current and the previous position. If PositionDelta
+        //   is odd, then a PayloadLength encoded as VInt follows,
+        //   if PositionDelta is even, then it is assumed that the
+        //   length of the current Payload equals the length of the
+        //   previous Payload.
         for (int j = 0; j < postingFreq; j++) {   // use delta-encoding
           int position = positions[j];
-          prox.writeVInt(position - lastPosition);
-          lastPosition = position;
-        }
-        // check to see if we switched to a new field
-        String termField = posting.term.field();
-        if (currentField != termField) {
-          // changing field - see if there is something to save
-          currentField = termField;
-          FieldInfo fi = fieldInfos.fieldInfo(currentField);
-          if (fi.storeTermVector) {
-            if (termVectorWriter == null) {
-              termVectorWriter =
-                new TermVectorsWriter(directory, segment, fieldInfos);
-              termVectorWriter.openDocument();
-            }
-            termVectorWriter.openField(currentField);
-          } else if (termVectorWriter != null) {
-            termVectorWriter.closeField();
+          int delta = position - lastPosition;
+          if (currentFieldHasPayloads) {
+            int payloadLength = 0;
+            Payload payload = null;
+            if (payloads != null) {
+              payload = payloads[j];
+              if (payload != null) {
+                payloadLength = payload.length;
+              }
+            }
+            if (payloadLength == lastPayloadLength) {
+              // the length of the current payload equals the length
+              // of the previous one. So we do not have to store the length
+              // again and we only shift the position delta by one bit
+              prox.writeVInt(delta * 2);
+            } else {
+              // the length of the current payload is different from the
+              // previous one. We shift the position delta, set the lowest
+              // bit and store the current payload length as VInt.
+              prox.writeVInt(delta * 2 + 1);
+              prox.writeVInt(payloadLength);
+              lastPayloadLength = payloadLength;
+            }
+            if (payloadLength > 0) {
+              // write current payload
+              prox.writeBytes(payload.data, payload.offset, payload.length);
+            }
+          } else {
+            // field does not store payloads, just write position delta as VInt
+            prox.writeVInt(delta);
           }
+          lastPosition = position;
         }
         if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
             termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
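To make the encoding comment above concrete, here is a small self-contained sketch of the same VInt scheme. The writeVInt helper mirrors what Lucene's IndexOutput.writeVInt does; the class and method names are invented for illustration and are not part of the commit:

  import java.io.ByteArrayOutputStream;

  public class ProxEncodingSketch {
    // 7 bits per byte, high bit set means "more bytes follow"
    static void writeVInt(ByteArrayOutputStream out, int i) {
      while ((i & ~0x7F) != 0) {
        out.write((i & 0x7F) | 0x80);
        i >>>= 7;
      }
      out.write(i);
    }

    // positions[j] ascending; payloads[j] may be null when a position has none
    static byte[] encode(int[] positions, byte[][] payloads) {
      ByteArrayOutputStream prox = new ByteArrayOutputStream();
      int lastPosition = 0, lastPayloadLength = -1;
      for (int j = 0; j < positions.length; j++) {
        int delta = positions[j] - lastPosition;
        int payloadLength = payloads[j] == null ? 0 : payloads[j].length;
        if (payloadLength == lastPayloadLength) {
          writeVInt(prox, delta * 2);          // even: length unchanged
        } else {
          writeVInt(prox, delta * 2 + 1);      // odd: new length follows
          writeVInt(prox, payloadLength);
          lastPayloadLength = payloadLength;
        }
        if (payloadLength > 0) {
          prox.write(payloads[j], 0, payloadLength);
        }
        lastPosition = positions[j];
      }
      return prox.toByteArray();
    }
  }

For positions {4, 9} with a 3-byte payload at each, this writes VInt(4*2+1) = 9, VInt(3) and the payload bytes, then VInt(5*2) = 10 and the payload bytes: the second entry omits the length because it is unchanged.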
@@ -397,18 +494,27 @@ final class Posting {                       // info about a Term in a doc
   Term term;                                  // the Term
   int freq;                                   // its frequency in doc
   int[] positions;                            // positions it occurs at
+  Payload[] payloads;                         // the payloads of the terms
   TermVectorOffsetInfo [] offsets;
 
-  Posting(Term t, int position, TermVectorOffsetInfo offset) {
+  Posting(Term t, int position, Payload payload, TermVectorOffsetInfo offset) {
     term = t;
     freq = 1;
     positions = new int[1];
     positions[0] = position;
+
+    if (payload != null) {
+      payloads = new Payload[1];
+      payloads[0] = payload;
+    } else
+      payloads = null;
+
     if(offset != null){
       offsets = new TermVectorOffsetInfo[1];
       offsets[0] = offset;
-    }
-    else
+    } else
       offsets = null;
   }
 }
@@ -28,9 +28,12 @@ final class FieldInfo {
   boolean storePositionWithTermVector;
 
   boolean omitNorms; // omit norms associated with indexed fields
+
+  boolean storePayloads; // whether this field stores payloads together with term positions
 
   FieldInfo(String na, boolean tk, int nu, boolean storeTermVector,
-            boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) {
+            boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
+            boolean omitNorms, boolean storePayloads) {
     name = na;
     isIndexed = tk;
     number = nu;
@@ -38,5 +41,6 @@ final class FieldInfo {
     this.storeOffsetWithTermVector = storeOffsetWithTermVector;
     this.storePositionWithTermVector = storePositionWithTermVector;
     this.omitNorms = omitNorms;
+    this.storePayloads = storePayloads;
   }
 }
@@ -39,6 +39,7 @@ final class FieldInfos {
   static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x4;
   static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x8;
   static final byte OMIT_NORMS = 0x10;
+  static final byte STORE_PAYLOADS = 0x20;
 
   private ArrayList byNumber = new ArrayList();
   private HashMap byName = new HashMap();
@@ -156,9 +157,29 @@ final class FieldInfos {
    */
   public void add(String name, boolean isIndexed, boolean storeTermVector,
                   boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) {
+    add(name, isIndexed, storeTermVector, storePositionWithTermVector,
+        storeOffsetWithTermVector, omitNorms, false);
+  }
+
+  /** If the field is not yet known, adds it. If it is known, checks to make
+   *  sure that the isIndexed flag is the same as was given previously for this
+   *  field. If not - marks it as being indexed. Same goes for the TermVector
+   *  parameters.
+   *
+   * @param name The name of the field
+   * @param isIndexed true if the field is indexed
+   * @param storeTermVector true if the term vector should be stored
+   * @param storePositionWithTermVector true if the term vector with positions should be stored
+   * @param storeOffsetWithTermVector true if the term vector with offsets should be stored
+   * @param omitNorms true if the norms for the indexed field should be omitted
+   * @param storePayloads true if payloads should be stored for this field
+   */
+  public void add(String name, boolean isIndexed, boolean storeTermVector,
+                  boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
+                  boolean omitNorms, boolean storePayloads) {
     FieldInfo fi = fieldInfo(name);
     if (fi == null) {
-      addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms);
+      addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads);
     } else {
       if (fi.isIndexed != isIndexed) {
         fi.isIndexed = true;                      // once indexed, always index
@@ -175,6 +196,9 @@ final class FieldInfos {
       if (fi.omitNorms != omitNorms) {
         fi.omitNorms = false;                     // once norms are stored, always store
       }
+      if (fi.storePayloads != storePayloads) {
+        fi.storePayloads = true;
+      }
 
     }
   }
@@ -182,10 +206,10 @@ final class FieldInfos {
 
   private void addInternal(String name, boolean isIndexed,
                            boolean storeTermVector, boolean storePositionWithTermVector,
-                           boolean storeOffsetWithTermVector, boolean omitNorms) {
+                           boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads) {
     FieldInfo fi =
       new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector,
-                    storeOffsetWithTermVector, omitNorms);
+                    storeOffsetWithTermVector, omitNorms, storePayloads);
     byNumber.add(fi);
     byName.put(name, fi);
   }
@@ -271,6 +295,7 @@ final class FieldInfos {
       if (fi.storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
       if (fi.storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR;
       if (fi.omitNorms) bits |= OMIT_NORMS;
+      if (fi.storePayloads) bits |= STORE_PAYLOADS;
       output.writeString(fi.name);
       output.writeByte(bits);
     }
@@ -286,8 +311,9 @@ final class FieldInfos {
       boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
      boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
       boolean omitNorms = (bits & OMIT_NORMS) != 0;
+      boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
 
-      addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms);
+      addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads);
     }
   }
@@ -62,6 +62,14 @@ public class FilterIndexReader extends IndexReader {
     public int nextPosition() throws IOException {
       return ((TermPositions) this.in).nextPosition();
     }
+
+    public int getPayloadLength() {
+      return ((TermPositions) this.in).getPayloadLength();
+    }
+
+    public byte[] getPayload(byte[] data, int offset) throws IOException {
+      return ((TermPositions) this.in).getPayload(data, offset);
+    }
   }
 
   /** Base class for filtering {@link TermEnum} implementations. */
@@ -67,6 +67,8 @@ public abstract class IndexReader {
     public static final FieldOption ALL = new FieldOption ("ALL");
     // all indexed fields
     public static final FieldOption INDEXED = new FieldOption ("INDEXED");
+    // all fields that store payloads
+    public static final FieldOption STORES_PAYLOADS = new FieldOption ("STORES_PAYLOADS");
     // all fields which are not indexed
     public static final FieldOption UNINDEXED = new FieldOption ("UNINDEXED");
     // all fields which are indexed with termvectors enabled
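With the new FieldOption, callers can discover which fields carry payloads. A one-line usage sketch, assuming an open IndexReader named reader:

  // returns a java.util.Collection of field names that store payloads
  Collection payloadFields = reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS);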
@@ -455,5 +455,12 @@ class MultiTermPositions extends MultiTermDocs implements TermPositions {
   public int nextPosition() throws IOException {
     return ((TermPositions)current).nextPosition();
   }
+
+  public int getPayloadLength() {
+    return ((TermPositions)current).getPayloadLength();
+  }
+
+  public byte[] getPayload(byte[] data, int offset) throws IOException {
+    return ((TermPositions)current).getPayload(data, offset);
+  }
 }
@@ -191,5 +191,23 @@ public class MultipleTermPositions implements TermPositions {
   public int read(int[] arg0, int[] arg1) throws IOException {
     throw new UnsupportedOperationException();
   }
+
+  /**
+   * Not implemented.
+   * @throws UnsupportedOperationException
+   */
+  public int getPayloadLength() {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * Not implemented.
+   * @throws UnsupportedOperationException
+   */
+  public byte[] getPayload(byte[] data, int offset) throws IOException {
+    throw new UnsupportedOperationException();
+  }
+
 }
@@ -419,7 +419,15 @@ public class ParallelReader extends IndexReader {
       return ((TermPositions)termDocs).nextPosition();
     }
 
+    public int getPayloadLength() {
+      return ((TermPositions)termDocs).getPayloadLength();
+    }
+
+    public byte[] getPayload(byte[] data, int offset) throws IOException {
+      return ((TermPositions)termDocs).getPayload(data, offset);
+    }
+
   }
 
 }
@@ -0,0 +1,114 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A Payload is metadata that can be stored together with each occurrence
+ * of a term. This metadata is stored inline in the posting list of the
+ * specific term.
+ * <p>
+ * To store payloads in the index a {@link TokenStream} has to be used that
+ * produces {@link Token}s containing payload data.
+ * <p>
+ * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)}
+ * to retrieve the payloads from the index.<br>
+ * <br>
+ *
+ * <b>
+ * Warning: The status of the Payloads feature is experimental. The APIs
+ * introduced here might change in the future and will not be supported anymore
+ * in such a case. If you want to use this feature in a production environment
+ * you should wait for an official release.
+ * </b>
+ */
+// TODO: Remove warning after API has been finalized
+public class Payload implements Serializable {
+  protected byte[] data;
+  protected int offset;
+  protected int length;
+
+  protected Payload() {
+    // no-arg constructor since this class implements Serializable
+  }
+
+  /**
+   * Creates a new payload with the given array as data.
+   *
+   * @param data the data of this payload
+   */
+  public Payload(byte[] data) {
+    this(data, 0, data.length);
+  }
+
+  /**
+   * Creates a new payload with the given array as data.
+   *
+   * @param data the data of this payload
+   * @param offset the offset in the data byte array
+   * @param length the length of the data
+   */
+  public Payload(byte[] data, int offset, int length) {
+    if (offset < 0 || offset + length > data.length) {
+      throw new IllegalArgumentException();
+    }
+    this.data = data;
+    this.offset = offset;
+    this.length = length;
+  }
+
+  public int length() {
+    return this.length;
+  }
+
+  /**
+   * Returns the byte at the given index.
+   */
+  public byte byteAt(int index) {
+    if (0 <= index && index < this.length) {
+      return this.data[this.offset + index];
+    }
+    throw new ArrayIndexOutOfBoundsException(index);
+  }
+
+  /**
+   * Allocates a new byte array, copies the payload data into it and returns it.
+   */
+  public byte[] toByteArray() {
+    byte[] retArray = new byte[this.length];
+    System.arraycopy(this.data, this.offset, retArray, 0, this.length);
+    return retArray;
+  }
+
+  /**
+   * Copies the payload data to a byte array.
+   *
+   * @param target the target byte array
+   * @param targetOffset the offset in the target byte array
+   */
+  public void copyTo(byte[] target, int targetOffset) {
+    if (this.length > target.length - targetOffset) {
+      throw new ArrayIndexOutOfBoundsException();
+    }
+    System.arraycopy(this.data, this.offset, target, targetOffset, this.length);
+  }
+}
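A short usage sketch for the new class, with invented values:

  byte[] raw = new byte[] { 10, 20, 30, 40 };
  Payload p = new Payload(raw, 1, 2);   // wraps the two bytes {20, 30}

  int len = p.length();                 // 2
  byte b = p.byteAt(0);                 // 20
  byte[] copy = p.toByteArray();        // new array {20, 30}

  byte[] target = new byte[8];
  p.copyTo(target, 4);                  // copies {20, 30} into target[4..5]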
@@ -157,11 +157,11 @@ final class SegmentMerger {
   }
 
   private void addIndexed(IndexReader reader, FieldInfos fieldInfos, Collection names, boolean storeTermVectors, boolean storePositionWithTermVector,
-                          boolean storeOffsetWithTermVector) throws IOException {
+                          boolean storeOffsetWithTermVector, boolean storePayloads) throws IOException {
     Iterator i = names.iterator();
     while (i.hasNext()) {
       String field = (String)i.next();
-      fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field));
+      fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field), storePayloads);
     }
   }
 
@@ -176,11 +176,12 @@ final class SegmentMerger {
     int docCount = 0;
     for (int i = 0; i < readers.size(); i++) {
       IndexReader reader = (IndexReader) readers.elementAt(i);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false);
       fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false);
     }
     fieldInfos.write(directory, segment + ".fnm");
@@ -326,6 +327,8 @@ final class SegmentMerger {
       termInfosWriter.add(smis[0].term, termInfo);
     }
   }
+
+  private byte[] payloadBuffer = null;
 
   /** Process postings from multiple segments all positioned on the
    *  same term. Writes out merged entries into freqOutput and
@@ -342,6 +345,8 @@ final class SegmentMerger {
     int lastDoc = 0;
     int df = 0;                                   // number of docs w/ term
     resetSkip();
+    boolean storePayloads = fieldInfos.fieldInfo(smis[0].term.field).storePayloads;
+    int lastPayloadLength = -1;                   // ensures that we write the first length
     for (int i = 0; i < n; i++) {
       SegmentMergeInfo smi = smis[i];
       TermPositions postings = smi.getPositions();
@@ -361,7 +366,7 @@ final class SegmentMerger {
         df++;
 
         if ((df % skipInterval) == 0) {
-          bufferSkip(lastDoc);
+          bufferSkip(lastDoc, storePayloads, lastPayloadLength);
         }
 
        int docCode = (doc - lastDoc) << 1;       // use low bit to flag freq=1
@@ -374,11 +379,33 @@ final class SegmentMerger {
           freqOutput.writeVInt(docCode);          // write doc
           freqOutput.writeVInt(freq);             // write frequency in doc
         }
 
+        /** See {@link DocumentWriter#writePostings(Posting[], String)} for
+         *  documentation about the encoding of positions and payloads
+         */
         int lastPosition = 0;                     // write position deltas
         for (int j = 0; j < freq; j++) {
           int position = postings.nextPosition();
-          proxOutput.writeVInt(position - lastPosition);
+          int delta = position - lastPosition;
+          if (storePayloads) {
+            int payloadLength = postings.getPayloadLength();
+            if (payloadLength == lastPayloadLength) {
+              proxOutput.writeVInt(delta * 2);
+            } else {
+              proxOutput.writeVInt(delta * 2 + 1);
+              proxOutput.writeVInt(payloadLength);
+              lastPayloadLength = payloadLength;
+            }
+            if (payloadLength > 0) {
+              if (payloadBuffer == null || payloadBuffer.length < payloadLength) {
+                payloadBuffer = new byte[payloadLength];
+              }
+              postings.getPayload(payloadBuffer, 0);
+              proxOutput.writeBytes(payloadBuffer, 0, payloadLength);
+            }
+          } else {
+            proxOutput.writeVInt(delta);
+          }
           lastPosition = position;
         }
       }
@@ -388,21 +415,59 @@ final class SegmentMerger {
 
   private RAMOutputStream skipBuffer = new RAMOutputStream();
   private int lastSkipDoc;
+  private int lastSkipPayloadLength;
   private long lastSkipFreqPointer;
   private long lastSkipProxPointer;
 
   private void resetSkip() {
     skipBuffer.reset();
     lastSkipDoc = 0;
+    lastSkipPayloadLength = -1;  // we don't have to write the first length in the skip list
     lastSkipFreqPointer = freqOutput.getFilePointer();
     lastSkipProxPointer = proxOutput.getFilePointer();
   }
 
-  private void bufferSkip(int doc) throws IOException {
+  private void bufferSkip(int doc, boolean storePayloads, int payloadLength) throws IOException {
    long freqPointer = freqOutput.getFilePointer();
     long proxPointer = proxOutput.getFilePointer();
 
-    skipBuffer.writeVInt(doc - lastSkipDoc);
+    // To efficiently store payloads in the posting lists we do not store the length of
+    // every payload. Instead we omit the length for a payload if the previous payload had
+    // the same length.
+    // However, in order to support skipping, the payload length at every skip point must be known.
+    // So we use the same length encoding that we use for the posting lists for the skip data as well:
+    // Case 1: current field does not store payloads
+    //   SkipDatum                 --> DocSkip, FreqSkip, ProxSkip
+    //   DocSkip,FreqSkip,ProxSkip --> VInt
+    //   DocSkip records the document number before every SkipInterval-th document in TermFreqs.
+    //   Document numbers are represented as differences from the previous value in the sequence.
+    // Case 2: current field stores payloads
+    //   SkipDatum                 --> DocSkip, PayloadLength?, FreqSkip, ProxSkip
+    //   DocSkip,FreqSkip,ProxSkip --> VInt
+    //   PayloadLength             --> VInt
+    //   In this case DocSkip/2 is the difference between
+    //   the current and the previous value. If DocSkip
+    //   is odd, then a PayloadLength encoded as VInt follows,
+    //   if DocSkip is even, then it is assumed that the
+    //   current payload length equals the length at the previous
+    //   skip point
+    if (storePayloads) {
+      int delta = doc - lastSkipDoc;
+      if (payloadLength == lastSkipPayloadLength) {
+        // the current payload length equals the length at the previous skip point,
+        // so we don't store the length again
+        skipBuffer.writeVInt(delta * 2);
+      } else {
+        // the payload length is different from the previous one. We shift the DocSkip,
+        // set the lowest bit and store the current payload length as VInt.
+        skipBuffer.writeVInt(delta * 2 + 1);
+        skipBuffer.writeVInt(payloadLength);
+        lastSkipPayloadLength = payloadLength;
+      }
+    } else {
+      // current field does not store payloads
+      skipBuffer.writeVInt(doc - lastSkipDoc);
+    }
     skipBuffer.writeVInt((int) (freqPointer - lastSkipFreqPointer));
     skipBuffer.writeVInt((int) (proxPointer - lastSkipProxPointer));
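A concrete walk-through of the skip encoding above, with invented numbers: for a payload-storing field with skip points at docs 16 and 32 and a payload length of 4 at both points, the first SkipDatum writes VInt(16*2+1) = 33 followed by VInt(4); the second writes VInt((32-16)*2) = 32 and omits the length, since it matches the previous skip point. The FreqSkip and ProxSkip VInts follow unchanged in both cases.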
@@ -374,6 +374,9 @@ class SegmentReader extends IndexReader {
       else if (!fi.isIndexed && fieldOption == IndexReader.FieldOption.UNINDEXED) {
         fieldSet.add(fi.name);
       }
+      else if (fi.storePayloads && fieldOption == IndexReader.FieldOption.STORES_PAYLOADS) {
+        fieldSet.add(fi.name);
+      }
       else if (fi.isIndexed && fieldOption == IndexReader.FieldOption.INDEXED) {
         fieldSet.add(fi.name);
       }
@@ -582,7 +585,12 @@ class SegmentReader extends IndexReader {
 
     return termVectorsReader.get(docNumber);
   }
+
+  /** Returns the field infos of this segment */
+  FieldInfos fieldInfos() {
+    return fieldInfos;
+  }
 
   /**
    * Return the name of the segment this reader is reading.
   */
@@ -39,6 +39,9 @@ class SegmentTermDocs implements TermDocs {
   private long proxPointer;
   private long skipPointer;
   private boolean haveSkipped;
+
+  private int payloadLengthAtLastSkip;
+  protected boolean currentFieldStoresPayloads;
 
   protected SegmentTermDocs(SegmentReader parent) {
     this.parent = parent;
@@ -49,23 +52,31 @@ class SegmentTermDocs implements TermDocs {
 
   public void seek(Term term) throws IOException {
     TermInfo ti = parent.tis.get(term);
-    seek(ti);
+    seek(ti, term);
   }
 
   public void seek(TermEnum termEnum) throws IOException {
     TermInfo ti;
+    Term term;
+
     // use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs
-    if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos)          // optimized case
-      ti = ((SegmentTermEnum) termEnum).termInfo();
-    else                                          // punt case
-      ti = parent.tis.get(termEnum.term());
-
-    seek(ti);
+    if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos) {        // optimized case
+      SegmentTermEnum segmentTermEnum = ((SegmentTermEnum) termEnum);
+      term = segmentTermEnum.term();
+      ti = segmentTermEnum.termInfo();
+    } else {                                      // punt case
+      term = termEnum.term();
+      ti = parent.tis.get(term);
+    }
+
+    seek(ti, term);
   }
 
-  void seek(TermInfo ti) throws IOException {
+  void seek(TermInfo ti, Term term) throws IOException {
     count = 0;
+    payloadLengthAtLastSkip = 0;
+    FieldInfo fi = parent.fieldInfos.fieldInfo(term.field);
+    currentFieldStoresPayloads = (fi != null) ? fi.storePayloads : false;
     if (ti == null) {
       df = 0;
     } else {
@@ -141,7 +152,7 @@ class SegmentTermDocs implements TermDocs {
   }
 
   /** Overridden by SegmentTermPositions to skip in prox stream. */
-  protected void skipProx(long proxPointer) throws IOException {}
+  protected void skipProx(long proxPointer, int payloadLength) throws IOException {}
 
   /** Optimized implementation. */
   public boolean skipTo(int target) throws IOException {
@@ -157,6 +168,7 @@ class SegmentTermDocs implements TermDocs {
 
       // scan skip data
       int lastSkipDoc = skipDoc;
+      int lastPayloadLength = 0;
      long lastFreqPointer = freqStream.getFilePointer();
       long lastProxPointer = -1;
       int numSkipped = -1 - (count % skipInterval);
@@ -165,6 +177,7 @@ class SegmentTermDocs implements TermDocs {
         lastSkipDoc = skipDoc;
         lastFreqPointer = freqPointer;
         lastProxPointer = proxPointer;
+        lastPayloadLength = payloadLengthAtLastSkip;
 
         if (skipDoc != 0 && skipDoc >= doc)
           numSkipped += skipInterval;
@@ -172,7 +185,21 @@ class SegmentTermDocs implements TermDocs {
         if(skipCount >= numSkips)
           break;
 
-        skipDoc += skipStream.readVInt();
+        if (currentFieldStoresPayloads) {
+          // the current field stores payloads.
+          // if the doc delta is odd then we have
+          // to read the current payload length
+          // because it differs from the length of the
+          // previous payload
+          int delta = skipStream.readVInt();
+          if ((delta & 1) != 0) {
+            payloadLengthAtLastSkip = skipStream.readVInt();
+          }
+          delta >>>= 1;
+          skipDoc += delta;
+        } else {
+          skipDoc += skipStream.readVInt();
+        }
         freqPointer += skipStream.readVInt();
         proxPointer += skipStream.readVInt();
 
@@ -182,7 +209,7 @@ class SegmentTermDocs implements TermDocs {
       // if we found something to skip, then skip it
       if (lastFreqPointer > freqStream.getFilePointer()) {
        freqStream.seek(lastFreqPointer);
-        skipProx(lastProxPointer);
+        skipProx(lastProxPointer, lastPayloadLength);
 
         doc = lastSkipDoc;
         count += numSkipped;
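On the consuming side, the methods added throughout this commit combine as follows. A minimal retrieval sketch, assuming an open IndexReader named reader, code running inside a method that throws IOException, and an illustrative field/term:

  TermPositions tp = reader.termPositions(new Term("body", "lucene"));
  try {
    while (tp.next()) {
      for (int i = 0; i < tp.freq(); i++) {
        int position = tp.nextPosition();
        int length = tp.getPayloadLength();
        if (length > 0) {
          // copies the payload of the current position into a fresh array
          byte[] payload = tp.getPayload(new byte[length], 0);
          // ... interpret the payload bytes ...
        }
      }
    }
  } finally {
    tp.close();
  }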
@@ -27,6 +27,12 @@ extends SegmentTermDocs implements TermPositions {
   private int proxCount;
   private int position;

+  // the current payload length
+  private int payloadLength;
+  // indicates whether the payload of the currend position has
+  // been read from the proxStream yet
+  private boolean needToLoadPayload;
+
   // these variables are being used to remember information
   // for a lazy skip
   private long lazySkipPointer = 0;

@@ -37,13 +43,15 @@ extends SegmentTermDocs implements TermPositions {
     this.proxStream = null;  // the proxStream will be cloned lazily when nextPosition() is called for the first time
   }

-  final void seek(TermInfo ti) throws IOException {
-    super.seek(ti);
+  final void seek(TermInfo ti, Term term) throws IOException {
+    super.seek(ti, term);
     if (ti != null)
       lazySkipPointer = ti.proxPointer;

     lazySkipProxCount = 0;
     proxCount = 0;
+    payloadLength = 0;
+    needToLoadPayload = false;
   }

   public final void close() throws IOException {

@@ -55,9 +63,28 @@ extends SegmentTermDocs implements TermPositions {
     // perform lazy skips if neccessary
     lazySkip();
     proxCount--;
-    return position += proxStream.readVInt();
+    return position += readDeltaPosition();
   }

+  private final int readDeltaPosition() throws IOException {
+    int delta = proxStream.readVInt();
+    if (currentFieldStoresPayloads) {
+      // if the current field stores payloads then
+      // the position delta is shifted one bit to the left.
+      // if the LSB is set, then we have to read the current
+      // payload length
+      if ((delta & 1) != 0) {
+        payloadLength = proxStream.readVInt();
+      }
+      delta >>>= 1;
+      needToLoadPayload = true;
+    } else {
+      payloadLength = 0;
+      needToLoadPayload = false;
+    }
+    return delta;
+  }
+
   protected final void skippingDoc() throws IOException {
     // we remember to skip a document lazily
     lazySkipProxCount += freq;

@@ -82,16 +109,27 @@ extends SegmentTermDocs implements TermPositions {


   /** Called by super.skipTo(). */
-  protected void skipProx(long proxPointer) throws IOException {
+  protected void skipProx(long proxPointer, int payloadLength) throws IOException {
     // we save the pointer, we might have to skip there lazily
     lazySkipPointer = proxPointer;
     lazySkipProxCount = 0;
     proxCount = 0;
+    this.payloadLength = payloadLength;
+    needToLoadPayload = false;
   }

   private void skipPositions(int n) throws IOException {
-    for (int f = n; f > 0; f--)  // skip unread positions
-      proxStream.readVInt();
+    for (int f = n; f > 0; f--) {  // skip unread positions
+      readDeltaPosition();
+      skipPayload();
+    }
+  }
+
+  private void skipPayload() throws IOException {
+    if (needToLoadPayload && payloadLength > 0) {
+      proxStream.seek(proxStream.getFilePointer() + payloadLength);
+    }
+    needToLoadPayload = false;
   }

   // It is not always neccessary to move the prox pointer

@@ -109,6 +147,10 @@ extends SegmentTermDocs implements TermPositions {
       // clone lazily
       proxStream = (IndexInput)parent.proxStream.clone();
     }

+    // we might have to skip the current payload
+    // if it was not read yet
+    skipPayload();
+
     if (lazySkipPointer != 0) {
       proxStream.seek(lazySkipPointer);

@@ -120,5 +162,31 @@ extends SegmentTermDocs implements TermPositions {
       lazySkipProxCount = 0;
     }
   }

+  public int getPayloadLength() {
+    return payloadLength;
+  }
+
+  public byte[] getPayload(byte[] data, int offset) throws IOException {
+    if (!needToLoadPayload) {
+      throw new IOException("Payload cannot be loaded more than once for the same term position.");
+    }
+
+    // read payloads lazily
+    byte[] retArray;
+    int retOffset;
+    if (data == null || data.length - offset < payloadLength) {
+      // the array is too small to store the payload data,
+      // so we allocate a new one
+      retArray = new byte[payloadLength];
+      retOffset = 0;
+    } else {
+      retArray = data;
+      retOffset = offset;
+    }
+    proxStream.readBytes(retArray, retOffset, payloadLength);
+    needToLoadPayload = false;
+    return retArray;
+  }
+
 }
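To make the readDeltaPosition() convention concrete, here is a small self-contained walk-through, not part of the commit, that decodes a few VInt values for a payload-enabled field exactly as the method above does. The values in the array are assumed sample data.

    class DeltaPositionWalkthrough {
      public static void main(String[] args) {
        // VInts as they would appear in the .prx file for one document:
        // (delta << 1) | 1 means a payload length follows; delta << 1 reuses the last length.
        int[] vints = { (3 << 1) | 1, 4, (2 << 1), (5 << 1) | 1, 2 };
        int position = 0, payloadLength = 0;
        for (int i = 0; i < vints.length; ) {
          int delta = vints[i++];
          if ((delta & 1) != 0) {
            payloadLength = vints[i++]; // low bit set: new payload length stored
          }
          position += delta >>> 1;
          // prints position=3 payloadLength=4, position=5 payloadLength=4, position=10 payloadLength=2
          System.out.println("position=" + position + " payloadLength=" + payloadLength);
          // PayloadData (payloadLength bytes) would follow each position in the file.
        }
      }
    }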
@@ -32,10 +32,53 @@ public interface TermPositions
 extends TermDocs
 {
   /** Returns next position in the current document.  It is an error to call
     this more than {@link #freq()} times
     without calling {@link #next()}<p> This is
     invalid until {@link #next()} is called for
     the first time.
   */
   int nextPosition() throws IOException;

+  /**
+   * Returns the length of the payload at the current term position.
+   * This is invalid until {@link #nextPosition()} is called for
+   * the first time.<br>
+   * <br>
+   * <b>
+   * Warning: The status of the Payloads feature is experimental. The APIs
+   * introduced here might change in the future and will not be supported anymore
+   * in such a case. If you want to use this feature in a production environment
+   * you should wait for an official release.
+   * </b>
+   * @return length of the current payload in number of bytes
+   */
+  // TODO: Remove warning after API has been finalized
+  int getPayloadLength();
+
+  /**
+   * Returns the payload data at the current term position.
+   * This is invalid until {@link #nextPosition()} is called for
+   * the first time.
+   * This method must not be called more than once after each call
+   * of {@link #nextPosition()}. However, payloads are loaded lazily,
+   * so if the payload data for the current position is not needed,
+   * this method may not be called at all for performance reasons.<br>
+   * <br>
+   * <b>
+   * Warning: The status of the Payloads feature is experimental. The APIs
+   * introduced here might change in the future and will not be supported anymore
+   * in such a case. If you want to use this feature in a production environment
+   * you should wait for an official release.
+   * </b>
+   *
+   * @param data the array into which the data of this payload is to be
+   *             stored, if it is big enough; otherwise, a new byte[] array
+   *             is allocated for this purpose.
+   * @param offset the offset in the array into which the data of this payload
+   *               is to be stored.
+   * @return a byte[] array containing the data of this payload
+   * @throws IOException
+   */
+  // TODO: Remove warning after API has been finalized
+  byte[] getPayload(byte[] data, int offset) throws IOException;
 }
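A sketch of how a caller would consume the new methods, written against the interface as shown in this diff rather than taken from the commit. It assumes the field was indexed with payloads, and it respects the contract that getPayload() is called at most once per nextPosition().

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermPositions;

    class PayloadReadSketch {
      static void dumpPayloads(IndexReader reader, Term term) throws IOException {
        TermPositions tp = reader.termPositions(term);
        try {
          while (tp.next()) {                        // each matching document
            for (int i = 0; i < tp.freq(); i++) {    // each position in that document
              int pos = tp.nextPosition();
              byte[] payload = tp.getPayload(null, 0); // at most once per position
              System.out.println("doc=" + tp.doc() + " pos=" + pos
                  + " payloadBytes=" + tp.getPayloadLength());
            }
          }
        } finally {
          tp.close();
        }
      }
    }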
@@ -24,8 +24,8 @@ public abstract class BufferedIndexOutput extends IndexOutput {
   static final int BUFFER_SIZE = 1024;

   private final byte[] buffer = new byte[BUFFER_SIZE];
   private long bufferStart = 0;           // position in file of buffer
   private int bufferPosition = 0;         // position in buffer

   /** Writes a single byte.
    * @see IndexInput#readByte()

@@ -41,12 +41,12 @@ public abstract class BufferedIndexOutput extends IndexOutput {
    * @param length the number of bytes to write
    * @see IndexInput#readBytes(byte[],int,int)
    */
-  public void writeBytes(byte[] b, int length) throws IOException {
+  public void writeBytes(byte[] b, int offset, int length) throws IOException {
     int bytesLeft = BUFFER_SIZE - bufferPosition;
     // is there enough space in the buffer?
     if (bytesLeft >= length) {
       // we add the data to the end of the buffer
-      System.arraycopy(b, 0, buffer, bufferPosition, length);
+      System.arraycopy(b, offset, buffer, bufferPosition, length);
       bufferPosition += length;
       // if the buffer is full, flush it
       if (BUFFER_SIZE - bufferPosition == 0)

@@ -58,7 +58,7 @@ public abstract class BufferedIndexOutput extends IndexOutput {
       if (bufferPosition > 0)
         flush();
       // and write data at once
-      flushBuffer(b, length);
+      flushBuffer(b, offset, length);
       bufferStart += length;
     } else {
       // we fill/flush the buffer (until the input is written)

@@ -66,7 +66,7 @@ public abstract class BufferedIndexOutput extends IndexOutput {
       int pieceLength;
       while (pos < length) {
         pieceLength = (length - pos < bytesLeft) ? length - pos : bytesLeft;
-        System.arraycopy(b, pos, buffer, bufferPosition, pieceLength);
+        System.arraycopy(b, pos + offset, buffer, bufferPosition, pieceLength);
         pos += pieceLength;
         bufferPosition += pieceLength;
         // if the buffer is full, flush it

@@ -92,8 +92,18 @@ public abstract class BufferedIndexOutput extends IndexOutput {
    * @param b the bytes to write
    * @param len the number of bytes to write
    */
-  protected abstract void flushBuffer(byte[] b, int len) throws IOException;
+  private void flushBuffer(byte[] b, int len) throws IOException {
+    flushBuffer(b, 0, len);
+  }
+
+  /** Expert: implements buffer write. Writes bytes at the current position in
+   * the output.
+   * @param b the bytes to write
+   * @param offset the offset in the byte array
+   * @param len the number of bytes to write
+   */
+  protected abstract void flushBuffer(byte[] b, int offset, int len) throws IOException;

   /** Closes this stream to further operations. */
   public void close() throws IOException {
     flush();
@@ -588,8 +588,8 @@ class FSIndexOutput extends BufferedIndexOutput {
     }

     /** output methods: */
-    public void flushBuffer(byte[] b, int size) throws IOException {
-      file.write(b, 0, size);
+    public void flushBuffer(byte[] b, int offset, int size) throws IOException {
+      file.write(b, offset, size);
     }
     public void close() throws IOException {
       // only close the file if it has not been closed yet
@@ -36,7 +36,17 @@ public abstract class IndexOutput {
    * @param length the number of bytes to write
    * @see IndexInput#readBytes(byte[],int,int)
    */
-  public abstract void writeBytes(byte[] b, int length) throws IOException;
+  public void writeBytes(byte[] b, int length) throws IOException {
+    writeBytes(b, 0, length);
+  }
+
+  /** Writes an array of bytes.
+   * @param b the bytes to write
+   * @param offset the offset in the byte array
+   * @param length the number of bytes to write
+   * @see IndexInput#readBytes(byte[],int,int)
+   */
+  public abstract void writeBytes(byte[] b, int offset, int length) throws IOException;

   /** Writes an int as four bytes.
    * @see IndexInput#readInt()
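The store changes above and below all follow one pattern: the old (array, length) entry points become concrete methods that delegate to a new (array, offset, length) variant, so FSIndexOutput, RAMOutputStream and MockRAMOutputStream only implement the offset-aware method. A hypothetical minimal subclass, written under the assumption that BufferedIndexOutput now leaves only flushBuffer(byte[], int, int) and length() abstract, might look like this; it is not part of the commit.

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import org.apache.lucene.store.BufferedIndexOutput;

    // Hypothetical sink, used only to illustrate the new abstract signature.
    class ByteArrayIndexOutput extends BufferedIndexOutput {
      private final ByteArrayOutputStream bytes = new ByteArrayOutputStream();

      // Only the offset-aware variant must be implemented;
      // the inherited flushBuffer(b, len) delegates here as flushBuffer(b, 0, len).
      protected void flushBuffer(byte[] b, int offset, int len) throws IOException {
        bytes.write(b, offset, len);
      }

      public long length() throws IOException {
        return bytes.size();
      }
    }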
@@ -66,7 +66,7 @@ public class RAMOutputStream extends BufferedIndexOutput {
     file.setLength(0);
   }

-  public void flushBuffer(byte[] src, int len) throws IOException {
+  public void flushBuffer(byte[] src, int offset, int len) throws IOException {
     byte[] buffer;
     int bufferPos = 0;
     while (bufferPos != len) {

@@ -81,7 +81,7 @@ public class RAMOutputStream extends BufferedIndexOutput {
       else
         buffer = (byte[]) file.buffers.get(bufferNumber);

-      System.arraycopy(src, bufferPos, buffer, bufferOffset, bytesToCopy);
+      System.arraycopy(src, offset + bufferPos, buffer, bufferOffset, bytesToCopy);
       bufferPos += bytesToCopy;
       pointer += bytesToCopy;
     }
@@ -1013,6 +1013,7 @@
       <li>If the third lowest-order bit is set (0x04), term positions are stored with the term vectors.</li>
       <li>If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors.</li>
       <li>If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field.</li>
+      <li>If the sixth lowest-order bit is set (0x20), payloads are stored for the indexed field.</li>
     </ul>
     </p>

@@ -1298,9 +1299,9 @@
     <sup>DocFreq/SkipInterval</sup>
     </p>
     <p>SkipDatum -->
-    DocSkip,FreqSkip,ProxSkip
+    DocSkip,PayloadLength?,FreqSkip,ProxSkip
     </p>
-    <p>DocDelta,Freq,DocSkip,FreqSkip,ProxSkip -->
+    <p>DocDelta,Freq,DocSkip,PayloadLength,FreqSkip,ProxSkip -->
     VInt
     </p>
     <p>TermFreqs

@@ -1328,9 +1329,17 @@
     SkipInterval
     <sup>th</sup>
     document in TermFreqs.
-    Document numbers are represented as differences
-    from the previous value in the sequence. FreqSkip
-    and ProxSkip record the position of every
+    If payloads are disabled for the term's field,
+    then DocSkip represents the difference from the
+    previous value in the sequence.
+    If payloads are enabled for the term's field,
+    then DocSkip/2 represents the difference from the
+    previous value in the sequence. If payloads are enabled
+    and DocSkip is odd,
+    then PayloadLength is stored indicating the length
+    of the last payload before the SkipInterval<sup>th</sup>
+    document in TermPositions.
+    FreqSkip and ProxSkip record the position of every
     SkipInterval
     <sup>th</sup>
     entry in FreqFile and

@@ -1379,12 +1388,21 @@
     <sup>DocFreq</sup>
     </p>
     <p>Positions -->
-    <PositionDelta>
+    <PositionDelta,Payload?>
     <sup>Freq</sup>
     </p>
+    <p>Payload -->
+    <PayloadLength?,PayloadData>
+    </p>
     <p>PositionDelta -->
     VInt
     </p>
+    <p>PayloadLength -->
+    VInt
+    </p>
+    <p>PayloadData -->
+    byte<sup>PayloadLength</sup>
+    </p>
     <p>TermPositions
     are ordered by term (the term is implicit, from the .tis file).
     </p>

@@ -1393,19 +1411,30 @@
     number is implicit from the .frq file).
     </p>
     <p>PositionDelta
-    is the difference between the position of the current occurrence in
+    is, if payloads are disabled for the term's field, the difference
+    between the position of the current occurrence in
     the document and the previous occurrence (or zero, if this is the
     first occurrence in this document).
+    If payloads are enabled for the term's field, then PositionDelta/2
+    is the difference between the current and the previous position. If
+    payloads are enabled and PositionDelta is odd, then PayloadLength is
+    stored, indicating the length of the payload at the current term position.
     </p>
     <p>
     For example, the TermPositions for a
     term which occurs as the fourth term in one document, and as the
     fifth and ninth term in a subsequent document, would be the following
-    sequence of VInts:
+    sequence of VInts (payloads disabled):
     </p>
     <p>4,
     5, 4
     </p>
+    <p>PayloadData
+    is metadata associated with the current term position. If PayloadLength
+    is stored at the current position, then it indicates the length of this
+    Payload. If PayloadLength is not stored, then this Payload has the same
+    length as the Payload at the previous position.
+    </p>
   </section>
   <section id="Normalization Factors"><title>Normalization Factors</title>
     <p>
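To tie the two cases in the documentation together, the following illustrative program, not part of the commit, prints the VInt stream for the example term above (fourth position in one document, fifth and ninth in the next), first with payloads disabled and then with assumed payload lengths of 2, 2 and 3.

    class PositionsEncodingExample {
      // Prints the VInt stream for one document's position list;
      // payloadLens == null means payloads are disabled for the field.
      static int print(int[] positions, int[] payloadLens, int lastLen) {
        int prev = 0;
        StringBuffer out = new StringBuffer();
        for (int i = 0; i < positions.length; i++) {
          int delta = positions[i] - prev;
          prev = positions[i];
          if (payloadLens == null) {
            out.append(delta).append(' ');             // plain PositionDelta
          } else if (payloadLens[i] != lastLen) {
            lastLen = payloadLens[i];
            out.append((delta << 1) | 1).append(' ')   // odd: PayloadLength follows
               .append(lastLen).append(' ');
          } else {
            out.append(delta << 1).append(' ');        // even: reuse previous length
          }
        }
        System.out.println(out);
        return lastLen;
      }

      public static void main(String[] args) {
        // payloads disabled: prints "4" then "5 4", the sequence from the text above
        print(new int[] {4}, null, 0);
        print(new int[] {5, 9}, null, 0);
        // payloads enabled with lengths 2, then 2 and 3: prints "9 2" then "10 9 3";
        // PayloadData bytes (2, 2 and 3 of them) follow each position in the .prx file
        int last = print(new int[] {4}, new int[] {2}, 0);
        print(new int[] {5, 9}, new int[] {2, 3}, last);
      }
    }

Note how the even VInt 10 reuses the previous payload length of 2, while the odd VInts 9 announce a stored PayloadLength.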
@@ -0,0 +1,443 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.RAMDirectory;
+
+
+public class TestPayloads extends TestCase {
+
+    // Simple tests to test the Payload class
+    public void testPayload() throws Exception {
+        byte[] testData = "This is a test!".getBytes();
+        Payload payload = new Payload(testData);
+        assertEquals("Wrong payload length.", testData.length, payload.length());
+
+        // test copyTo()
+        byte[] target = new byte[testData.length - 1];
+        try {
+            payload.copyTo(target, 0);
+            fail("Expected exception not thrown");
+        } catch (Exception expected) {
+            // expected exception
+        }
+
+        target = new byte[testData.length + 3];
+        payload.copyTo(target, 3);
+
+        for (int i = 0; i < testData.length; i++) {
+            assertEquals(testData[i], target[i + 3]);
+        }
+
+
+        // test toByteArray()
+        target = payload.toByteArray();
+        assertByteArrayEquals(testData, target);
+
+        // test byteAt()
+        for (int i = 0; i < testData.length; i++) {
+            assertEquals(payload.byteAt(i), testData[i]);
+        }
+
+        try {
+            payload.byteAt(testData.length + 1);
+            fail("Expected exception not thrown");
+        } catch (Exception expected) {
+            // expected exception
+        }
+    }
+
+    // Tests whether the DocumentWriter and SegmentMerger correctly enable the
+    // payload bit in the FieldInfo
+    public void testPayloadFieldBit() throws Exception {
+        Directory ram = new RAMDirectory();
+        PayloadAnalyzer analyzer = new PayloadAnalyzer();
+        IndexWriter writer = new IndexWriter(ram, analyzer, true);
+        Document d = new Document();
+        // this field won't have any payloads
+        d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED));
+        // this field will have payloads in all docs, however not for all term positions,
+        // so this field is used to check if the DocumentWriter correctly enables the payloads bit
+        // even if only some term positions have payloads
+        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+        // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
+        // enabled in only some documents
+        d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED));
+        // only add payload data for field f2
+        analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1);
+        writer.addDocument(d);
+        // flush
+        writer.close();
+
+        // only one segment in the index, so we can cast to SegmentReader
+        SegmentReader reader = (SegmentReader) IndexReader.open(ram);
+        FieldInfos fi = reader.fieldInfos();
+        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
+        assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
+        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads);
+        reader.close();
+
+        // now we add another document which has payloads for field f3 and verify if the SegmentMerger
+        // enabled payloads for that field
+        writer = new IndexWriter(ram, analyzer, true);
+        d = new Document();
+        d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED));
+        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+        d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED));
+        // add payload data for field f2 and f3
+        analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
+        analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);
+        writer.addDocument(d);
+        // force merge
+        writer.optimize();
+        // flush
+        writer.close();
+
+        // only one segment in the index, so we can cast to SegmentReader
+        reader = (SegmentReader) IndexReader.open(ram);
+        fi = reader.fieldInfos();
+        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
+        assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
+        assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads);
+        reader.close();
+    }
+
+    // Tests if payloads are correctly stored and loaded using both RamDirectory and FSDirectory
+    public void testPayloadsEncoding() throws Exception {
+        // first perform the test using a RAMDirectory
+        Directory dir = new RAMDirectory();
+        performTest(dir);
+
+        // now use a FSDirectory and repeat same test
+        String dirName = "test_payloads";
+        dir = FSDirectory.getDirectory(dirName);
+        performTest(dir);
+        rmDir(dirName);
+    }
+
+    // builds an index with payloads in the given Directory and performs
+    // different tests to verify the payload encoding
+    private void performTest(Directory dir) throws Exception {
+        PayloadAnalyzer analyzer = new PayloadAnalyzer();
+        IndexWriter writer = new IndexWriter(dir, analyzer, true);
+
+        // should be in sync with value in TermInfosWriter
+        final int skipInterval = 16;
+
+        final int numTerms = 5;
+        final String fieldName = "f1";
+
+        int numDocs = skipInterval + 1;
+        // create content for the test documents with just a few terms
+        Term[] terms = generateTerms(fieldName, numTerms);
+        StringBuffer sb = new StringBuffer();
+        for (int i = 0; i < terms.length; i++) {
+            sb.append(terms[i].text);
+            sb.append(" ");
+        }
+        String content = sb.toString();
+
+
+        int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
+        byte[] payloadData = generateRandomData(payloadDataLength);
+
+        Document d = new Document();
+        d.add(new Field(fieldName, content, Field.Store.NO, Field.Index.TOKENIZED));
+        // add the same document multiple times to have the same payload lengths for all
+        // occurrences within two consecutive skip intervals
+        int offset = 0;
+        for (int i = 0; i < 2 * numDocs; i++) {
+            analyzer.setPayloadData(fieldName, payloadData, offset, 1);
+            offset += numTerms;
+            writer.addDocument(d);
+        }
+
+        // now we make sure to have different payload lengths next at the next skip point
+        for (int i = 0; i < numDocs; i++) {
+            analyzer.setPayloadData(fieldName, payloadData, offset, i);
+            offset += i * numTerms;
+            writer.addDocument(d);
+        }
+
+        writer.optimize();
+        // flush
+        writer.close();
+
+
+        /*
+         * Verify the index
+         * first we test if all payloads are stored correctly
+         */
+        IndexReader reader = IndexReader.open(dir);
+
+        byte[] verifyPayloadData = new byte[payloadDataLength];
+        offset = 0;
+        TermPositions[] tps = new TermPositions[numTerms];
+        for (int i = 0; i < numTerms; i++) {
+            tps[i] = reader.termPositions(terms[i]);
+        }
+
+        while (tps[0].next()) {
+            for (int i = 1; i < numTerms; i++) {
+                tps[i].next();
+            }
+            int freq = tps[0].freq();
+
+            for (int i = 0; i < freq; i++) {
+                for (int j = 0; j < numTerms; j++) {
+                    tps[j].nextPosition();
+                    tps[j].getPayload(verifyPayloadData, offset);
+                    offset += tps[j].getPayloadLength();
+                }
+            }
+        }
+
+        for (int i = 0; i < numTerms; i++) {
+            tps[i].close();
+        }
+
+        assertByteArrayEquals(payloadData, verifyPayloadData);
+
+        /*
+         * test lazy skipping
+         */
+        TermPositions tp = reader.termPositions(terms[0]);
+        tp.next();
+        tp.nextPosition();
+        // now we don't read this payload
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        byte[] payload = tp.getPayload(null, 0);
+        assertEquals(payload[0], payloadData[numTerms]);
+        tp.nextPosition();
+
+        // we don't read this payload and skip to a different document
+        tp.skipTo(5);
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        payload = tp.getPayload(null, 0);
+        assertEquals(payload[0], payloadData[5 * numTerms]);
+
+
+        /*
+         * Test different lengths at skip points
+         */
+        tp.seek(terms[1]);
+        tp.next();
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        tp.skipTo(skipInterval - 1);
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        tp.skipTo(2 * skipInterval - 1);
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        tp.skipTo(3 * skipInterval - 1);
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength());
+
+        /*
+         * Test multiple call of getPayload()
+         */
+        tp.getPayload(null, 0);
+        try {
+            // it is forbidden to call getPayload() more than once
+            // without calling nextPosition()
+            tp.getPayload(null, 0);
+            fail("Expected exception not thrown");
+        } catch (Exception expected) {
+            // expected exception
+        }
+
+        reader.close();
+
+        // test long payload
+        analyzer = new PayloadAnalyzer();
+        writer = new IndexWriter(dir, analyzer, true);
+        String singleTerm = "lucene";
+
+        d = new Document();
+        d.add(new Field(fieldName, singleTerm, Field.Store.NO, Field.Index.TOKENIZED));
+        // add a payload whose length is greater than the buffer size of BufferedIndexOutput
+        payloadData = generateRandomData(2000);
+        analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
+        writer.addDocument(d);
+
+
+        writer.optimize();
+        // flush
+        writer.close();
+
+        reader = IndexReader.open(dir);
+        tp = reader.termPositions(new Term(fieldName, singleTerm));
+        tp.next();
+        tp.nextPosition();
+
+        verifyPayloadData = new byte[tp.getPayloadLength()];
+        tp.getPayload(verifyPayloadData, 0);
+        byte[] portion = new byte[1500];
+        System.arraycopy(payloadData, 100, portion, 0, 1500);
+
+        assertByteArrayEquals(portion, verifyPayloadData);
+        reader.close();
+
+    }
+
+    private byte[] generateRandomData(int n) {
+        Random rnd = new Random();
+        byte[] data = new byte[n];
+        rnd.nextBytes(data);
+        return data;
+    }
+
+    private Term[] generateTerms(String fieldName, int n) {
+        int maxDigits = (int) (Math.log(n) / Math.log(10));
+        Term[] terms = new Term[n];
+        StringBuffer sb = new StringBuffer();
+        for (int i = 0; i < n; i++) {
+            sb.setLength(0);
+            sb.append("t");
+            int zeros = maxDigits - (int) (Math.log(i) / Math.log(10));
+            for (int j = 0; j < zeros; j++) {
+                sb.append("0");
+            }
+            sb.append(i);
+            terms[i] = new Term(fieldName, sb.toString());
+        }
+        return terms;
+    }
+
+
+    private void rmDir(String dir) {
+        File fileDir = new File(dir);
+        if (fileDir.exists()) {
+            File[] files = fileDir.listFiles();
+            if (files != null) {
+                for (int i = 0; i < files.length; i++) {
+                    files[i].delete();
+                }
+            }
+            fileDir.delete();
+        }
+    }
+
+
+
+    void assertByteArrayEquals(byte[] b1, byte[] b2) {
+        if (b1.length != b2.length) {
+            fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
+        }
+
+        for (int i = 0; i < b1.length; i++) {
+            if (b1[i] != b2[i]) {
+                fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
+            }
+        }
+    }
+
+
+    /**
+     * This Analyzer uses an WhitespaceTokenizer and PayloadFilter.
+     */
+    private static class PayloadAnalyzer extends Analyzer {
+        Map fieldToData = new HashMap();
+
+        void setPayloadData(String field, byte[] data, int offset, int length) {
+            fieldToData.put(field, new PayloadData(0, data, offset, length));
+        }
+
+        void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) {
+            fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length));
+        }
+
+        public TokenStream tokenStream(String fieldName, Reader reader) {
+            PayloadData payload = (PayloadData) fieldToData.get(fieldName);
+            TokenStream ts = new WhitespaceTokenizer(reader);
+            if (payload != null) {
+                if (payload.numFieldInstancesToSkip == 0) {
+                    ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length);
+                } else {
+                    payload.numFieldInstancesToSkip--;
+                }
+            }
+            return ts;
+        }
+
+        private static class PayloadData {
+            byte[] data;
+            int offset;
+            int length;
+            int numFieldInstancesToSkip;
+
+            PayloadData(int skip, byte[] data, int offset, int length) {
+                numFieldInstancesToSkip = skip;
+                this.data = data;
+                this.offset = offset;
+                this.length = length;
+            }
+        }
+    }
+
+
+    /**
+     * This Filter adds payloads to the tokens.
+     */
+    private static class PayloadFilter extends TokenFilter {
+        private byte[] data;
+        private int length;
+        private int offset;
+
+        public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
+            super(in);
+            this.data = data;
+            this.length = length;
+            this.offset = offset;
+        }
+
+        public Token next() throws IOException {
+            Token nextToken = input.next();
+            if (nextToken != null && offset + length <= data.length) {
+                nextToken.setPayload(new Payload(data, offset, length));
+                offset += length;
+            }
+
+            return nextToken;
+        }
+    }
+}
@@ -48,7 +48,7 @@ public class MockRAMOutputStream extends RAMOutputStream {
     }
   }

-  public void flushBuffer(byte[] src, int len) throws IOException {
+  public void flushBuffer(byte[] src, int offset, int len) throws IOException {
     long freeSpace = dir.maxSize - dir.sizeInBytes();
     long realUsage = 0;

@@ -63,14 +63,14 @@ public class MockRAMOutputStream extends RAMOutputStream {
     if (dir.maxSize != 0 && freeSpace <= len) {
       if (freeSpace > 0 && freeSpace < len) {
         realUsage += freeSpace;
-        super.flushBuffer(src, (int) freeSpace);
+        super.flushBuffer(src, offset, (int) freeSpace);
       }
       if (realUsage > dir.maxUsedSize) {
         dir.maxUsedSize = realUsage;
       }
       throw new IOException("fake disk full at " + dir.getRecomputedActualSizeInBytes() + " bytes");
     } else {
-      super.flushBuffer(src, len);
+      super.flushBuffer(src, offset, len);
     }

     if (first) {