From bd85dca2afcc4f493c4a033fffab77c56d01b68e Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Sat, 11 Jan 2014 19:23:13 +0000
Subject: [PATCH] LUCENE-5394: Fix TokenSources.getTokenStream to return
 payloads if they were indexed with term vectors

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1557439 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                                 |  3 +
 .../lucene/search/highlight/TokenSources.java      | 34 +++++++--
 .../TokenStreamFromTermPositionVector.java         | 13 ++++
 .../search/highlight/TokenSourcesTest.java         | 75 +++++++++++++++++--
 4 files changed, 113 insertions(+), 12 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index bb7b7636094..dddd47a616b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -153,6 +153,9 @@ Bug fixes
   domain-only URLs that are followed by an alphanumeric character.
   (Chris Geeringh, Steve Rowe)
 
+* LUCENE-5394: Fix TokenSources.getTokenStream to return payloads if
+  they were indexed with the term vectors.  (Mike McCandless)
+
 API Changes
 
 * LUCENE-5339: The facet module was simplified/reworked to make the
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
index e0b3c8c95ea..14a8f7f5239 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
@@ -20,11 +20,16 @@ package org.apache.lucene.search.highlight;
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Fields;
@@ -35,10 +40,6 @@ import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Comparator;
-
 /**
  * Hides implementation issues associated with obtaining a TokenStream for use
  * with the higlighter - can obtain from TermFreqVectors with offsets and
@@ -169,11 +170,14 @@ public class TokenSources {
 
     PositionIncrementAttribute posincAtt;
 
+    PayloadAttribute payloadAtt;
+
     StoredTokenStream(Token tokens[]) {
       this.tokens = tokens;
      termAtt = addAttribute(CharTermAttribute.class);
       offsetAtt = addAttribute(OffsetAttribute.class);
       posincAtt = addAttribute(PositionIncrementAttribute.class);
+      payloadAtt = addAttribute(PayloadAttribute.class);
     }
 
     @Override
@@ -185,6 +189,10 @@ public class TokenSources {
         clearAttributes();
         termAtt.setEmpty().append(token);
         offsetAtt.setOffset(token.startOffset(), token.endOffset());
+        BytesRef payload = token.getPayload();
+        if (payload != null) {
+          payloadAtt.setPayload(payload);
+        }
         posincAtt
             .setPositionIncrement(currentToken <= 1
                 || tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
@@ -192,6 +200,9 @@ public class TokenSources {
         return true;
       }
     }
+
+    boolean hasPayloads = tpv.hasPayloads();
+
     // code to reconstruct the original sequence of Tokens
     TermsEnum termsEnum = tpv.iterator(null);
     int totalTokens = 0;
@@ -223,6 +234,13 @@ public class TokenSources {
 
         final Token token = new Token(term,
             dpEnum.startOffset(), dpEnum.endOffset());
+        if (hasPayloads) {
+          // Must make a deep copy of the returned payload,
+          // since D&PEnum API is allowed to re-use on every
+          // call:
+          token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+        }
+
         if (tokenPositionsGuaranteedContiguous && pos != -1) {
           // We have positions stored and a guarantee that the token position
           // information is contiguous
@@ -253,9 +271,11 @@ public class TokenSources {
       ArrayUtil.timSort(tokensInOriginalOrder, new Comparator<Token>() {
         @Override
         public int compare(Token t1, Token t2) {
-          if (t1.startOffset() == t2.startOffset()) return t1.endOffset()
-              - t2.endOffset();
-          else return t1.startOffset() - t2.startOffset();
+          if (t1.startOffset() == t2.startOffset()) {
+            return t1.endOffset() - t2.endOffset();
+          } else {
+            return t1.startOffset() - t2.startOffset();
+          }
         }
       });
     }
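The TokenSources.java hunks above are the caller-visible half of the fix: StoredTokenStream now registers a PayloadAttribute and replays each stored Token's payload. A minimal consumer sketch, assuming a reader whose "field" was indexed with term vectors plus positions, offsets, and payloads (the reader variable and the field name are illustrative, not part of the patch; the calls mirror the test added at the end of this patch):

    TokenStream ts = TokenSources.getTokenStream(
        reader.getTermVectors(0).terms("field"), false);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
    while (ts.incrementToken()) {
      // Payload is null for tokens that were indexed without one:
      BytesRef payload = payloadAtt.getPayload();
      System.out.println(termAtt + " payload=" + payload);
    }
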
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
index 4057bd96950..42db712998e 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Terms;
@@ -48,6 +49,8 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
 
   private OffsetAttribute offsetAttribute;
 
+  private PayloadAttribute payloadAttribute;
+
   /**
    * Constructor.
    *
@@ -59,7 +62,9 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
     termAttribute = addAttribute(CharTermAttribute.class);
     positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
     offsetAttribute = addAttribute(OffsetAttribute.class);
+    payloadAttribute = addAttribute(PayloadAttribute.class);
     final boolean hasOffsets = vector.hasOffsets();
+    final boolean hasPayloads = vector.hasPayloads();
     final TermsEnum termsEnum = vector.iterator(null);
     BytesRef text;
     DocsAndPositionsEnum dpEnum = null;
@@ -79,6 +84,13 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
         token = new Token();
         token.setEmpty().append(text.utf8ToString());
       }
+      if (hasPayloads) {
+        // Must make a deep copy of the returned payload,
+        // since D&PEnum API is allowed to re-use on every
+        // call:
+        token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+      }
+
       // Yes - this is the position, not the increment! This is for
       // sorting. This value
       // will be corrected before use.
@@ -112,6 +124,7 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
       positionIncrementAttribute.setPositionIncrement(next
           .getPositionIncrement());
       offsetAttribute.setOffset(next.startOffset(), next.endOffset());
+      payloadAttribute.setPayload(next.getPayload());
       return true;
     }
     return false;
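Both files apply the same guarded deep copy, and the reason is in the added comment: DocsAndPositionsEnum.getPayload() may hand back a BytesRef whose bytes the enum reuses on the next nextPosition() call, so storing the reference directly would leave every buffered Token aliasing the same scratch buffer. The safe pattern, sketched here with an extra null check for positions that carry no payload (dpEnum and token as in the hunks above):

    BytesRef payload = dpEnum.getPayload(); // may be a reused buffer, or null
    if (payload != null) {
      // Private copy survives the next nextPosition() call:
      token.setPayload(BytesRef.deepCopyOf(payload));
    }
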
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
index 00cc34f6356..0fc5fe31534 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
@@ -17,10 +17,14 @@ package org.apache.lucene.search.highlight;
  * limitations under the License.
  */
 
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -29,6 +33,7 @@ import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.DisjunctionMaxQuery;
 import org.apache.lucene.search.IndexSearcher;
@@ -38,10 +43,9 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
-import java.io.IOException;
-
 // LUCENE-2874
 public class TokenSourcesTest extends LuceneTestCase {
   private static final String FIELD = "text";
@@ -262,7 +266,6 @@ public class TokenSourcesTest extends LuceneTestCase {
 
   public void testTermVectorWithoutOffsetsThrowsException()
       throws IOException, InvalidTokenOffsetsException {
-    final String TEXT = "the fox did not jump";
     final Directory directory = newDirectory();
     final IndexWriter indexWriter = new IndexWriter(directory,
         newIndexWriterConfig(TEST_VERSION_CURRENT, null));
@@ -280,8 +283,7 @@ public class TokenSourcesTest extends LuceneTestCase {
     final IndexReader indexReader = DirectoryReader.open(directory);
     try {
       assertEquals(1, indexReader.numDocs());
-      final TokenStream tokenStream = TokenSources
-          .getTokenStream(
+      TokenSources.getTokenStream(
               indexReader.getTermVector(0, FIELD),
               false);
       fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
@@ -295,5 +297,68 @@ public class TokenSourcesTest extends LuceneTestCase {
       directory.close();
     }
   }
 
+  int curOffset;
+
+  /** Just make a token with the text, and set the payload
+   *  to the text as well.  Offsets increment "naturally". */
+  private Token getToken(String text) {
+    Token t = new Token(text, curOffset, curOffset+text.length());
+    t.setPayload(new BytesRef(text));
+    curOffset++;
+    return t;
+  }
+
+  // LUCENE-5394
+  public void testPayloads() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
+    myFieldType.setStoreTermVectors(true);
+    myFieldType.setStoreTermVectorOffsets(true);
+    myFieldType.setStoreTermVectorPositions(true);
+    myFieldType.setStoreTermVectorPayloads(true);
+
+    curOffset = 0;
+
+    Token[] tokens = new Token[] {
+      getToken("foxes"),
+      getToken("can"),
+      getToken("jump"),
+      getToken("high")
+    };
+
+    Document doc = new Document();
+    doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
+    writer.addDocument(doc);
+
+    IndexReader reader = writer.getReader();
+    writer.close();
+    assertEquals(1, reader.numDocs());
+
+    for(int i=0;i<2;i++) {
+      // Do this twice, once passing true and then passing
+      // false: they are entirely different code paths
+      // under-the-hood:
+      TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), i == 0);
+      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+      PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
+      OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
+      PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
+
+      for(Token token : tokens) {
+        assertTrue(ts.incrementToken());
+        assertEquals(token.toString(), termAtt.toString());
+        assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
+        assertEquals(token.getPayload(), payloadAtt.getPayload());
+        assertEquals(token.startOffset(), offsetAtt.startOffset());
+        assertEquals(token.endOffset(), offsetAtt.endOffset());
+      }
+
+      assertFalse(ts.incrementToken());
+    }
+
+    reader.close();
+    dir.close();
+  }
 }
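The i == 0 / i == 1 loop in testPayloads is the heart of the test: as its comment says, the boolean passed to TokenSources.getTokenStream(Terms, boolean) selects two entirely different reconstruction paths, which is exactly why the patch has to thread payloads through both TokenSources.StoredTokenStream and TokenStreamFromTermPositionVector. Schematically (terms is assumed to be the payload-carrying Terms instance obtained from getTermVectors, as in the test):

    Terms terms = reader.getTermVectors(0).terms("field");
    TokenStream a = TokenSources.getTokenStream(terms, true);  // i == 0 in the test
    TokenStream b = TokenSources.getTokenStream(terms, false); // i == 1 in the test
    // Both streams must now report identical payloads via PayloadAttribute.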