From bd85dca2afcc4f493c4a033fffab77c56d01b68e Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Sat, 11 Jan 2014 19:23:13 +0000
Subject: [PATCH] LUCENE-5394: Fix TokenSources.getTokenStream to return
 payloads if they were indexed with term vectors

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1557439 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                                 |  3 +
 .../lucene/search/highlight/TokenSources.java      | 34 +++++++--
 .../TokenStreamFromTermPositionVector.java         | 13 ++++
 .../search/highlight/TokenSourcesTest.java         | 75 +++++++++++++++++--
 4 files changed, 113 insertions(+), 12 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index bb7b7636094..dddd47a616b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -153,6 +153,9 @@ Bug fixes
   domain-only URLs that are followed by an alphanumeric character.
   (Chris Geeringh, Steve Rowe)
 
+* LUCENE-5394: Fix TokenSources.getTokenStream to return payloads if
+  they were indexed with the term vectors.  (Mike McCandless)
+
 API Changes
 
 * LUCENE-5339: The facet module was simplified/reworked to make the
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
index e0b3c8c95ea..14a8f7f5239 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
@@ -20,11 +20,16 @@ package org.apache.lucene.search.highlight;
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Fields;
@@ -35,10 +40,6 @@ import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Comparator;
-
 /**
  * Hides implementation issues associated with obtaining a TokenStream for use
  * with the higlighter - can obtain from TermFreqVectors with offsets and
@@ -169,11 +170,14 @@ public class TokenSources {
 
     PositionIncrementAttribute posincAtt;
 
+    PayloadAttribute payloadAtt;
+
     StoredTokenStream(Token tokens[]) {
       this.tokens = tokens;
      termAtt = addAttribute(CharTermAttribute.class);
       offsetAtt = addAttribute(OffsetAttribute.class);
       posincAtt = addAttribute(PositionIncrementAttribute.class);
+      payloadAtt = addAttribute(PayloadAttribute.class);
     }
 
     @Override
@@ -185,6 +189,10 @@ public class TokenSources {
         clearAttributes();
         termAtt.setEmpty().append(token);
         offsetAtt.setOffset(token.startOffset(), token.endOffset());
+        BytesRef payload = token.getPayload();
+        if (payload != null) {
+          payloadAtt.setPayload(payload);
+        }
         posincAtt
             .setPositionIncrement(currentToken <= 1
                 || tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
@@ -192,6 +200,9 @@ public class TokenSources {
         return true;
       }
     }
+
+    boolean hasPayloads = tpv.hasPayloads();
+
     // code to reconstruct the original sequence of Tokens
     TermsEnum termsEnum = tpv.iterator(null);
     int totalTokens = 0;
@@ -223,6 +234,13 @@ public class TokenSources {
 
         final Token token = new Token(term,
             dpEnum.startOffset(), dpEnum.endOffset());
+        if (hasPayloads) {
+          // Must make a deep copy of the returned payload,
+          // since D&PEnum API is allowed to re-use on every
+          // call:
+          token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+        }
+
         if (tokenPositionsGuaranteedContiguous && pos != -1) {
           // We have positions stored and a guarantee that the token position
           // information is contiguous
@@ -253,9 +271,11 @@ public class TokenSources {
       ArrayUtil.timSort(tokensInOriginalOrder, new Comparator<Token>() {
         @Override
         public int compare(Token t1, Token t2) {
-          if (t1.startOffset() == t2.startOffset()) return t1.endOffset()
-              - t2.endOffset();
-          else return t1.startOffset() - t2.startOffset();
+          if (t1.startOffset() == t2.startOffset()) {
+            return t1.endOffset() - t2.endOffset();
+          } else {
+            return t1.startOffset() - t2.startOffset();
+          }
         }
       });
     }
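The TokenSources.java hunks above are the caller-visible half of the fix: StoredTokenStream now registers a PayloadAttribute and replays each stored Token's payload. A minimal consumer sketch, assuming a reader whose "field" was indexed with term vectors plus positions, offsets, and payloads (the reader variable and the field name are illustrative, not part of the patch; the calls mirror the test added at the end of this patch):

    TokenStream ts = TokenSources.getTokenStream(
        reader.getTermVectors(0).terms("field"), false);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
    while (ts.incrementToken()) {
      // Payload is null for tokens that were indexed without one:
      BytesRef payload = payloadAtt.getPayload();
      System.out.println(termAtt + " payload=" + payload);
    }
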
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
index 4057bd96950..42db712998e 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Terms;
@@ -48,6 +49,8 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
 
   private OffsetAttribute offsetAttribute;
 
+  private PayloadAttribute payloadAttribute;
+
   /**
    * Constructor.
    *
@@ -59,7 +62,9 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
     termAttribute = addAttribute(CharTermAttribute.class);
     positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
     offsetAttribute = addAttribute(OffsetAttribute.class);
+    payloadAttribute = addAttribute(PayloadAttribute.class);
     final boolean hasOffsets = vector.hasOffsets();
+    final boolean hasPayloads = vector.hasPayloads();
     final TermsEnum termsEnum = vector.iterator(null);
     BytesRef text;
     DocsAndPositionsEnum dpEnum = null;
@@ -79,6 +84,13 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
         token = new Token();
         token.setEmpty().append(text.utf8ToString());
       }
+      if (hasPayloads) {
+        // Must make a deep copy of the returned payload,
+        // since D&PEnum API is allowed to re-use on every
+        // call:
+        token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+      }
+
       // Yes - this is the position, not the increment! This is for
       // sorting. This value
       // will be corrected before use.
@@ -112,6 +124,7 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
       positionIncrementAttribute.setPositionIncrement(next
           .getPositionIncrement());
       offsetAttribute.setOffset(next.startOffset(), next.endOffset());
+      payloadAttribute.setPayload(next.getPayload());
       return true;
     }
     return false;
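Both files apply the same guarded deep copy, and the reason is in the added comment: DocsAndPositionsEnum.getPayload() may hand back a BytesRef whose bytes the enum reuses on the next nextPosition() call, so storing the reference directly would leave every buffered Token aliasing the same scratch buffer. The safe pattern, sketched here with an extra null check for positions that carry no payload (dpEnum and token as in the hunks above):

    BytesRef payload = dpEnum.getPayload(); // may be a reused buffer, or null
    if (payload != null) {
      // Private copy survives the next nextPosition() call:
      token.setPayload(BytesRef.deepCopyOf(payload));
    }
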
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
index 00cc34f6356..0fc5fe31534 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
@@ -17,10 +17,14 @@ package org.apache.lucene.search.highlight;
  * limitations under the License.
  */
 
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -29,6 +33,7 @@ import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.DisjunctionMaxQuery;
 import org.apache.lucene.search.IndexSearcher;
@@ -38,10 +43,9 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
-import java.io.IOException;
-
 // LUCENE-2874
 public class TokenSourcesTest extends LuceneTestCase {
   private static final String FIELD = "text";
@@ -262,7 +266,6 @@ public class TokenSourcesTest extends LuceneTestCase {
 
   public void testTermVectorWithoutOffsetsThrowsException()
       throws IOException, InvalidTokenOffsetsException {
-    final String TEXT = "the fox did not jump";
     final Directory directory = newDirectory();
     final IndexWriter indexWriter = new IndexWriter(directory,
         newIndexWriterConfig(TEST_VERSION_CURRENT, null));
@@ -280,8 +283,7 @@ public class TokenSourcesTest extends LuceneTestCase {
     final IndexReader indexReader = DirectoryReader.open(directory);
     try {
       assertEquals(1, indexReader.numDocs());
-      final TokenStream tokenStream = TokenSources
-          .getTokenStream(
+      TokenSources.getTokenStream(
               indexReader.getTermVector(0, FIELD),
               false);
       fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
@@ -295,5 +297,68 @@ public class TokenSourcesTest extends LuceneTestCase {
       directory.close();
     }
   }
 
+  int curOffset;
+
+  /** Just make a token with the text, and set the payload
+   *  to the text as well.  Offsets increment "naturally". */
+  private Token getToken(String text) {
+    Token t = new Token(text, curOffset, curOffset+text.length());
+    t.setPayload(new BytesRef(text));
+    curOffset++;
+    return t;
+  }
+
+  // LUCENE-5394
+  public void testPayloads() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
+    myFieldType.setStoreTermVectors(true);
+    myFieldType.setStoreTermVectorOffsets(true);
+    myFieldType.setStoreTermVectorPositions(true);
+    myFieldType.setStoreTermVectorPayloads(true);
+
+    curOffset = 0;
+
+    Token[] tokens = new Token[] {
+      getToken("foxes"),
+      getToken("can"),
+      getToken("jump"),
+      getToken("high")
+    };
+
+    Document doc = new Document();
+    doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
+    writer.addDocument(doc);
+
+    IndexReader reader = writer.getReader();
+    writer.close();
+    assertEquals(1, reader.numDocs());
+
+    for(int i=0;i<2;i++) {
+      // Do this twice, once passing true and then passing
+      // false: they are entirely different code paths
+      // under-the-hood:
+      TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), i == 0);
+      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+      PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
+      OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
+      PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
+
+      for(Token token : tokens) {
+        assertTrue(ts.incrementToken());
+        assertEquals(token.toString(), termAtt.toString());
+        assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
+        assertEquals(token.getPayload(), payloadAtt.getPayload());
+        assertEquals(token.startOffset(), offsetAtt.startOffset());
+        assertEquals(token.endOffset(), offsetAtt.endOffset());
+      }
+
+      assertFalse(ts.incrementToken());
+    }
+
+    reader.close();
+    dir.close();
+  }
 }
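The i == 0 / i == 1 loop in testPayloads is the heart of the test: as its comment says, the boolean passed to TokenSources.getTokenStream(Terms, boolean) selects two entirely different reconstruction paths, which is exactly why the patch has to thread payloads through both TokenSources.StoredTokenStream and TokenStreamFromTermPositionVector. Schematically (terms is assumed to be the payload-carrying Terms instance obtained from getTermVectors, as in the test):

    Terms terms = reader.getTermVectors(0).terms("field");
    TokenStream a = TokenSources.getTokenStream(terms, true);  // i == 0 in the test
    TokenStream b = TokenSources.getTokenStream(terms, false); // i == 1 in the test
    // Both streams must now report identical payloads via PayloadAttribute.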