mirror of https://github.com/apache/lucene.git
LUCENE-5394: Fix TokenSources.getTokenStream to return payloads if they were indexed with term vectors
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1557439 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 13084e4e81
commit bd85dca2af
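
As a quick orientation (not part of the commit itself): with a field indexed to store term vectors with positions, offsets, and payloads, the stream returned by TokenSources.getTokenStream should now carry each token's payload. A minimal consumer sketch, assuming an already-open IndexReader and a hypothetical docId/field; before this commit, payloadAtt.getPayload() came back null even when payloads were stored in the vectors:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.highlight.TokenSources;

public class PayloadStreamDemo {
  /** Prints term/payload pairs recovered from one document's term vector. */
  public static void dump(IndexReader reader, int docId, String field) throws Exception {
    // Assumes the field was indexed with term vectors + positions + offsets + payloads.
    TokenStream ts = TokenSources.getTokenStream(
        reader.getTermVectors(docId).terms(field),
        false); // false = do not require contiguous token positions
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(termAtt + " payload=" + payloadAtt.getPayload());
    }
    ts.end();
    ts.close();
  }
}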

lucene/CHANGES.txt

@@ -153,6 +153,9 @@ Bug fixes
   domain-only URLs that are followed by an alphanumeric character.
   (Chris Geeringh, Steve Rowe)
 
+* LUCENE-5394: Fix TokenSources.getTokenStream to return payloads if
+  they were indexed with the term vectors. (Mike McCandless)
+
 API Changes
 
 * LUCENE-5339: The facet module was simplified/reworked to make the

lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java

@@ -20,11 +20,16 @@ package org.apache.lucene.search.highlight;
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Fields;
@@ -35,10 +40,6 @@ import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Comparator;
-
 /**
  * Hides implementation issues associated with obtaining a TokenStream for use
  * with the highlighter - can obtain from TermFreqVectors with offsets and
@@ -169,11 +170,14 @@ public class TokenSources {
 
       PositionIncrementAttribute posincAtt;
+
+      PayloadAttribute payloadAtt;
 
       StoredTokenStream(Token tokens[]) {
        this.tokens = tokens;
        termAtt = addAttribute(CharTermAttribute.class);
        offsetAtt = addAttribute(OffsetAttribute.class);
        posincAtt = addAttribute(PositionIncrementAttribute.class);
+       payloadAtt = addAttribute(PayloadAttribute.class);
      }
 
      @Override
@@ -185,6 +189,10 @@ public class TokenSources {
        clearAttributes();
        termAtt.setEmpty().append(token);
        offsetAtt.setOffset(token.startOffset(), token.endOffset());
+       BytesRef payload = token.getPayload();
+       if (payload != null) {
+         payloadAtt.setPayload(payload);
+       }
        posincAtt
            .setPositionIncrement(currentToken <= 1
                || tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
@@ -192,6 +200,9 @@ public class TokenSources {
        return true;
      }
    }
+
+   boolean hasPayloads = tpv.hasPayloads();
+
    // code to reconstruct the original sequence of Tokens
    TermsEnum termsEnum = tpv.iterator(null);
    int totalTokens = 0;
@@ -223,6 +234,13 @@ public class TokenSources {
        final Token token = new Token(term,
            dpEnum.startOffset(),
            dpEnum.endOffset());
+       if (hasPayloads) {
+         // Must make a deep copy of the returned payload,
+         // since D&PEnum API is allowed to re-use on every
+         // call:
+         token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+       }
+
        if (tokenPositionsGuaranteedContiguous && pos != -1) {
          // We have positions stored and a guarantee that the token position
          // information is contiguous
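
The deep copy above is essential: DocsAndPositionsEnum may hand back the same mutable BytesRef on every getPayload() call, overwriting it in place as iteration advances, so storing the bare reference would silently corrupt payloads captured earlier. A standalone illustration of that aliasing hazard (a contrived stand-in, not code from this commit):

import org.apache.lucene.util.BytesRef;

public class DeepCopyDemo {
  public static void main(String[] args) {
    BytesRef shared = new BytesRef("foxes");     // stands in for the enum's reused buffer
    BytesRef alias = shared;                     // keeping the reference directly...
    BytesRef copy = BytesRef.deepCopyOf(shared); // ...versus taking a snapshot
    shared.bytes[shared.offset] = (byte) 'b';    // the enum advances and rewrites its buffer
    System.out.println(alias.utf8ToString());    // "boxes" - the aliased payload was clobbered
    System.out.println(copy.utf8ToString());     // "foxes" - the deep copy is stable
  }
}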
@@ -253,9 +271,11 @@ public class TokenSources {
      ArrayUtil.timSort(tokensInOriginalOrder, new Comparator<Token>() {
        @Override
        public int compare(Token t1, Token t2) {
-         if (t1.startOffset() == t2.startOffset()) return t1.endOffset()
-             - t2.endOffset();
-         else return t1.startOffset() - t2.startOffset();
+         if (t1.startOffset() == t2.startOffset()) {
+           return t1.endOffset() - t2.endOffset();
+         } else {
+           return t1.startOffset() - t2.startOffset();
+         }
        }
      });
    }

lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Terms;
@@ -48,6 +49,8 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
 
   private OffsetAttribute offsetAttribute;
 
+  private PayloadAttribute payloadAttribute;
+
   /**
    * Constructor.
    *
@@ -59,7 +62,9 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
     termAttribute = addAttribute(CharTermAttribute.class);
     positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
     offsetAttribute = addAttribute(OffsetAttribute.class);
+    payloadAttribute = addAttribute(PayloadAttribute.class);
     final boolean hasOffsets = vector.hasOffsets();
+    final boolean hasPayloads = vector.hasPayloads();
     final TermsEnum termsEnum = vector.iterator(null);
     BytesRef text;
     DocsAndPositionsEnum dpEnum = null;
@@ -79,6 +84,13 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
        token = new Token();
        token.setEmpty().append(text.utf8ToString());
      }
+     if (hasPayloads) {
+       // Must make a deep copy of the returned payload,
+       // since D&PEnum API is allowed to re-use on every
+       // call:
+       token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+     }
+
      // Yes - this is the position, not the increment! This is for
      // sorting. This value
      // will be corrected before use.
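
The "position, not the increment" comment flags a small trick used here: each Token temporarily stores its absolute position in the increment field so the tokens can be sorted by position, and the field is rewritten as a delta afterwards. A hedged sketch of that correction pass (hypothetical helper, simplified from what the class actually does):

import java.util.List;
import org.apache.lucene.analysis.Token;

final class PositionFixup {
  /** Rewrites absolute positions (stashed in the increment field) as real increments. */
  static void toIncrements(List<Token> sortedByPosition) {
    int lastPosition = -1;
    for (Token token : sortedByPosition) {
      int thisPosition = token.getPositionIncrement();         // currently the absolute position
      token.setPositionIncrement(thisPosition - lastPosition); // now a proper increment
      lastPosition = thisPosition;
    }
  }
}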
@@ -112,6 +124,7 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
      positionIncrementAttribute.setPositionIncrement(next
          .getPositionIncrement());
      offsetAttribute.setOffset(next.startOffset(), next.endOffset());
+     payloadAttribute.setPayload(next.getPayload());
      return true;
    }
    return false;

lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
@@ -17,10 +17,14 @@ package org.apache.lucene.search.highlight;
  * limitations under the License.
  */
 
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -29,6 +33,7 @@ import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.DisjunctionMaxQuery;
 import org.apache.lucene.search.IndexSearcher;
@@ -38,10 +43,9 @@ import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 
-import java.io.IOException;
-
 // LUCENE-2874
 public class TokenSourcesTest extends LuceneTestCase {
   private static final String FIELD = "text";
@@ -262,7 +266,6 @@ public class TokenSourcesTest extends LuceneTestCase {
 
   public void testTermVectorWithoutOffsetsThrowsException()
       throws IOException, InvalidTokenOffsetsException {
     final String TEXT = "the fox did not jump";
     final Directory directory = newDirectory();
     final IndexWriter indexWriter = new IndexWriter(directory,
         newIndexWriterConfig(TEST_VERSION_CURRENT, null));
@@ -280,8 +283,7 @@ public class TokenSourcesTest extends LuceneTestCase {
     final IndexReader indexReader = DirectoryReader.open(directory);
     try {
       assertEquals(1, indexReader.numDocs());
-      final TokenStream tokenStream = TokenSources
-          .getTokenStream(
+      TokenSources.getTokenStream(
           indexReader.getTermVector(0, FIELD),
           false);
       fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
@@ -295,5 +297,68 @@ public class TokenSourcesTest extends LuceneTestCase {
     }
   }
 
+  int curOffset;
+
+  /** Just make a token with the text, and set the payload
+   *  to the text as well. Offsets increment "naturally". */
+  private Token getToken(String text) {
+    Token t = new Token(text, curOffset, curOffset+text.length());
+    t.setPayload(new BytesRef(text));
+    curOffset++;
+    return t;
+  }
+
+  // LUCENE-5394
+  public void testPayloads() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
+    myFieldType.setStoreTermVectors(true);
+    myFieldType.setStoreTermVectorOffsets(true);
+    myFieldType.setStoreTermVectorPositions(true);
+    myFieldType.setStoreTermVectorPayloads(true);
+
+    curOffset = 0;
+
+    Token[] tokens = new Token[] {
+      getToken("foxes"),
+      getToken("can"),
+      getToken("jump"),
+      getToken("high")
+    };
+
+    Document doc = new Document();
+    doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
+    writer.addDocument(doc);
+
+    IndexReader reader = writer.getReader();
+    writer.close();
+    assertEquals(1, reader.numDocs());
+
+    for(int i=0;i<2;i++) {
+      // Do this twice, once passing true and then passing
+      // false: they are entirely different code paths
+      // under-the-hood:
+      TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), i == 0);
+
+      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+      PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
+      OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
+      PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
+
+      for(Token token : tokens) {
+        assertTrue(ts.incrementToken());
+        assertEquals(token.toString(), termAtt.toString());
+        assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
+        assertEquals(token.getPayload(), payloadAtt.getPayload());
+        assertEquals(token.startOffset(), offsetAtt.startOffset());
+        assertEquals(token.endOffset(), offsetAtt.endOffset());
+      }
+
+      assertFalse(ts.incrementToken());
+    }
+
+    reader.close();
+    dir.close();
+  }
 }
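
To exercise just the new test against a checkout of this revision, something like the following should work with the ant-based build of that era (the flags are from memory, so treat them as an assumption):

cd lucene/highlighter
ant test -Dtestcase=TokenSourcesTest -Dtests.method=testPayloads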