LUCENE-5394: Fix TokenSources.getTokenStream to return payloads if they were indexed with term vectors

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1557439 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2014-01-11 19:23:13 +00:00
parent 13084e4e81
commit bd85dca2af
4 changed files with 113 additions and 12 deletions

lucene/CHANGES.txt

@@ -153,6 +153,9 @@ Bug fixes
   domain-only URLs that are followed by an alphanumeric character.
   (Chris Geeringh, Steve Rowe)
 
+* LUCENE-5394: Fix TokenSources.getTokenStream to return payloads if
+  they were indexed with the term vectors. (Mike McCandless)
+
 API Changes
 
 * LUCENE-5339: The facet module was simplified/reworked to make the
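
For orientation, a minimal sketch of what this fix enables, modeled on the new testPayloads test further down; the reader and field setup here are assumptions, not part of this commit:

    // Assumes "reader" holds a doc whose "field" was indexed with term
    // vectors plus positions, offsets, and payloads.
    TokenStream ts = TokenSources.getTokenStream(
        reader.getTermVectors(0).terms("field"), true);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
    while (ts.incrementToken()) {
      // Before this fix the payload attribute stayed empty on this path;
      // now it carries the payload bytes stored with the term vector.
      BytesRef payload = payloadAtt.getPayload();
    }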

org/apache/lucene/search/highlight/TokenSources.java

@@ -20,11 +20,16 @@ package org.apache.lucene.search.highlight
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Fields;
@@ -35,10 +40,6 @@ import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Comparator;
-
 /**
  * Hides implementation issues associated with obtaining a TokenStream for use
  * with the highlighter - can obtain from TermFreqVectors with offsets and
@@ -169,11 +170,14 @@ public class TokenSources {
     PositionIncrementAttribute posincAtt;
+    PayloadAttribute payloadAtt;
 
     StoredTokenStream(Token tokens[]) {
       this.tokens = tokens;
       termAtt = addAttribute(CharTermAttribute.class);
       offsetAtt = addAttribute(OffsetAttribute.class);
      posincAtt = addAttribute(PositionIncrementAttribute.class);
+      payloadAtt = addAttribute(PayloadAttribute.class);
     }
 
     @Override
@@ -185,6 +189,10 @@ public class TokenSources {
       clearAttributes();
       termAtt.setEmpty().append(token);
       offsetAtt.setOffset(token.startOffset(), token.endOffset());
+      BytesRef payload = token.getPayload();
+      if (payload != null) {
+        payloadAtt.setPayload(payload);
+      }
       posincAtt
           .setPositionIncrement(currentToken <= 1
               || tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
@@ -192,6 +200,9 @@ public class TokenSources {
       return true;
     }
   }
 
+    boolean hasPayloads = tpv.hasPayloads();
+
   // code to reconstruct the original sequence of Tokens
   TermsEnum termsEnum = tpv.iterator(null);
   int totalTokens = 0;
@@ -223,6 +234,13 @@ public class TokenSources {
       final Token token = new Token(term,
                                     dpEnum.startOffset(),
                                     dpEnum.endOffset());
+      if (hasPayloads) {
+        // Must make a deep copy of the returned payload,
+        // since D&PEnum API is allowed to re-use on every
+        // call:
+        token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+      }
+
       if (tokenPositionsGuaranteedContiguous && pos != -1) {
         // We have positions stored and a guarantee that the token position
         // information is contiguous
@@ -253,9 +271,11 @@ public class TokenSources {
       ArrayUtil.timSort(tokensInOriginalOrder, new Comparator<Token>() {
         @Override
         public int compare(Token t1, Token t2) {
-          if (t1.startOffset() == t2.startOffset()) return t1.endOffset()
-              - t2.endOffset();
-          else return t1.startOffset() - t2.startOffset();
+          if (t1.startOffset() == t2.startOffset()) {
+            return t1.endOffset() - t2.endOffset();
+          } else {
+            return t1.startOffset() - t2.startOffset();
+          }
         }
       });
     }
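
The deep copy a few hunks up is the subtle part of this path: a DocsAndPositionsEnum is allowed to reuse the BytesRef it returns from getPayload() on every call, so a Token must hold its own copy. A short sketch of the aliasing hazard this avoids (enum setup elided; names are placeholders):

    dpEnum.nextPosition();
    BytesRef alias = dpEnum.getPayload();        // enum may reuse these bytes
    BytesRef copy = BytesRef.deepCopyOf(alias);  // stable private copy
    dpEnum.nextPosition();                       // may overwrite alias's bytes
    // "copy" still holds the first payload; "alias" no longer reliably does.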

org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java

@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Terms;
@@ -48,6 +49,8 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
   private OffsetAttribute offsetAttribute;
 
+  private PayloadAttribute payloadAttribute;
+
   /**
    * Constructor.
    *
@@ -59,7 +62,9 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
     termAttribute = addAttribute(CharTermAttribute.class);
     positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
     offsetAttribute = addAttribute(OffsetAttribute.class);
+    payloadAttribute = addAttribute(PayloadAttribute.class);
     final boolean hasOffsets = vector.hasOffsets();
+    final boolean hasPayloads = vector.hasPayloads();
     final TermsEnum termsEnum = vector.iterator(null);
     BytesRef text;
     DocsAndPositionsEnum dpEnum = null;
@@ -79,6 +84,13 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
         token = new Token();
         token.setEmpty().append(text.utf8ToString());
       }
+      if (hasPayloads) {
+        // Must make a deep copy of the returned payload,
+        // since D&PEnum API is allowed to re-use on every
+        // call:
+        token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+      }
+
       // Yes - this is the position, not the increment! This is for
       // sorting. This value
       // will be corrected before use.
@@ -112,6 +124,7 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
       positionIncrementAttribute.setPositionIncrement(next
           .getPositionIncrement());
       offsetAttribute.setOffset(next.startOffset(), next.endOffset());
+      payloadAttribute.setPayload(next.getPayload());
       return true;
     }
     return false;
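
Note that incrementToken() now sets the payload unconditionally: when the vector had no payloads, Token.getPayload() is null and setPayload(null) simply clears the attribute, so consumers see null rather than stale bytes. A minimal consumer sketch (the constructor argument and reader setup are assumed, not shown in this diff):

    TokenStream ts = new TokenStreamFromTermPositionVector(
        reader.getTermVector(0, "field"));
    PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      BytesRef payload = payloadAtt.getPayload(); // null if none indexed
    }
    ts.end();
    ts.close();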

org/apache/lucene/search/highlight/TokenSourcesTest.java

@@ -17,10 +17,14 @@ package org.apache.lucene.search.highlight
  * limitations under the License.
  */
 
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -29,6 +33,7 @@ import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.DisjunctionMaxQuery;
 import org.apache.lucene.search.IndexSearcher;
@@ -38,10 +43,9 @@ import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 
-import java.io.IOException;
-
 // LUCENE-2874
 public class TokenSourcesTest extends LuceneTestCase {
   private static final String FIELD = "text";
@@ -262,7 +266,6 @@ public class TokenSourcesTest extends LuceneTestCase {
   public void testTermVectorWithoutOffsetsThrowsException()
       throws IOException, InvalidTokenOffsetsException {
-    final String TEXT = "the fox did not jump";
     final Directory directory = newDirectory();
     final IndexWriter indexWriter = new IndexWriter(directory,
         newIndexWriterConfig(TEST_VERSION_CURRENT, null));
@@ -280,8 +283,7 @@
     final IndexReader indexReader = DirectoryReader.open(directory);
     try {
       assertEquals(1, indexReader.numDocs());
-      final TokenStream tokenStream = TokenSources
-          .getTokenStream(
+      TokenSources.getTokenStream(
           indexReader.getTermVector(0, FIELD),
           false);
       fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
@@ -295,5 +297,68 @@
       }
     }
   }
+
+  int curOffset;
+
+  /** Just make a token with the text, and set the payload
+   *  to the text as well. Offsets increment "naturally". */
+  private Token getToken(String text) {
+    Token t = new Token(text, curOffset, curOffset+text.length());
+    t.setPayload(new BytesRef(text));
+    curOffset++;
+    return t;
+  }
+
+  // LUCENE-5394
+  public void testPayloads() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
+    myFieldType.setStoreTermVectors(true);
+    myFieldType.setStoreTermVectorOffsets(true);
+    myFieldType.setStoreTermVectorPositions(true);
+    myFieldType.setStoreTermVectorPayloads(true);
+
+    curOffset = 0;
+    Token[] tokens = new Token[] {
+      getToken("foxes"),
+      getToken("can"),
+      getToken("jump"),
+      getToken("high")
+    };
+
+    Document doc = new Document();
+    doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
+
+    writer.addDocument(doc);
+    IndexReader reader = writer.getReader();
+    writer.close();
+
+    assertEquals(1, reader.numDocs());
+
+    for(int i=0;i<2;i++) {
+      // Do this twice, once passing true and then passing
+      // false: they are entirely different code paths
+      // under-the-hood:
+      TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), i == 0);
+      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+      PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
+      OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
+      PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
+
+      for(Token token : tokens) {
+        assertTrue(ts.incrementToken());
+        assertEquals(token.toString(), termAtt.toString());
+        assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
+        assertEquals(token.getPayload(), payloadAtt.getPayload());
+        assertEquals(token.startOffset(), offsetAtt.startOffset());
+        assertEquals(token.endOffset(), offsetAtt.endOffset());
+      }
+      assertFalse(ts.incrementToken());
+    }
+
+    reader.close();
+    dir.close();
+  }
 }