LUCENE-5394: Fix TokenSources.getTokenStream to return payloads if they were indexed with term vectors

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1557439 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2014-01-11 19:23:13 +00:00
parent 13084e4e81
commit bd85dca2af
4 changed files with 113 additions and 12 deletions

View File

@ -153,6 +153,9 @@ Bug fixes
domain-only URLs that are followed by an alphanumeric character.
(Chris Geeringh, Steve Rowe)
* LUCENE-5394: Fix TokenSources.getTokenStream to return payloads if
they were indexed with the term vectors. (Mike McCandless)
API Changes
* LUCENE-5339: The facet module was simplified/reworked to make the

View File

@ -20,11 +20,16 @@ package org.apache.lucene.search.highlight;
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
@ -35,10 +40,6 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
/**
* Hides implementation issues associated with obtaining a TokenStream for use
* with the highlighter - can obtain from TermFreqVectors with offsets and
@ -169,11 +170,14 @@ public class TokenSources {
PositionIncrementAttribute posincAtt;
PayloadAttribute payloadAtt;
/**
 * Creates a stream that replays the given tokens.
 *
 * <p>Registers the term, offset, position-increment and payload attributes
 * on this stream so that each replayed token can populate them.
 *
 * @param tokens the tokens to replay; the array is stored as-is, not copied
 */
StoredTokenStream(Token[] tokens) {
  this.tokens = tokens;
  // Attribute registration order is kept unchanged: term, offset,
  // position increment, payload.
  this.termAtt = addAttribute(CharTermAttribute.class);
  this.offsetAtt = addAttribute(OffsetAttribute.class);
  this.posincAtt = addAttribute(PositionIncrementAttribute.class);
  this.payloadAtt = addAttribute(PayloadAttribute.class);
}
@Override
@ -185,6 +189,10 @@ public class TokenSources {
clearAttributes();
termAtt.setEmpty().append(token);
offsetAtt.setOffset(token.startOffset(), token.endOffset());
BytesRef payload = token.getPayload();
if (payload != null) {
payloadAtt.setPayload(payload);
}
posincAtt
.setPositionIncrement(currentToken <= 1
|| tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
@ -192,6 +200,9 @@ public class TokenSources {
return true;
}
}
boolean hasPayloads = tpv.hasPayloads();
// code to reconstruct the original sequence of Tokens
TermsEnum termsEnum = tpv.iterator(null);
int totalTokens = 0;
@ -223,6 +234,13 @@ public class TokenSources {
final Token token = new Token(term,
dpEnum.startOffset(),
dpEnum.endOffset());
if (hasPayloads) {
// Must make a deep copy of the returned payload,
// since D&PEnum API is allowed to re-use on every
// call:
token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
}
if (tokenPositionsGuaranteedContiguous && pos != -1) {
// We have positions stored and a guarantee that the token position
// information is contiguous
@ -253,9 +271,11 @@ public class TokenSources {
ArrayUtil.timSort(tokensInOriginalOrder, new Comparator<Token>() {
/**
 * Orders tokens by start offset, breaking ties by end offset.
 *
 * @param t1 the first token
 * @param t2 the second token
 * @return a negative, zero or positive value when {@code t1} sorts before,
 *         equal to, or after {@code t2}
 */
@Override
public int compare(Token t1, Token t2) {
  // A single comparison replaces the duplicated if/else pair; the second
  // copy could never execute because both branches of the first returned.
  final int byStart = Integer.compare(t1.startOffset(), t2.startOffset());
  if (byStart != 0) {
    return byStart;
  }
  return Integer.compare(t1.endOffset(), t2.endOffset());
}
});
}

View File

@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Terms;
@ -48,6 +49,8 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
private OffsetAttribute offsetAttribute;
private PayloadAttribute payloadAttribute;
/**
* Constructor.
*
@ -59,7 +62,9 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
termAttribute = addAttribute(CharTermAttribute.class);
positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
offsetAttribute = addAttribute(OffsetAttribute.class);
payloadAttribute = addAttribute(PayloadAttribute.class);
final boolean hasOffsets = vector.hasOffsets();
final boolean hasPayloads = vector.hasPayloads();
final TermsEnum termsEnum = vector.iterator(null);
BytesRef text;
DocsAndPositionsEnum dpEnum = null;
@ -79,6 +84,13 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
token = new Token();
token.setEmpty().append(text.utf8ToString());
}
if (hasPayloads) {
// Must make a deep copy of the returned payload,
// since D&PEnum API is allowed to re-use on every
// call:
token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
}
// Yes - this is the position, not the increment! This is for
// sorting. This value
// will be corrected before use.
@ -112,6 +124,7 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
positionIncrementAttribute.setPositionIncrement(next
.getPositionIncrement());
offsetAttribute.setOffset(next.startOffset(), next.endOffset());
payloadAttribute.setPayload(next.getPayload());
return true;
}
return false;

View File

@ -17,10 +17,14 @@ package org.apache.lucene.search.highlight;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -29,6 +33,7 @@ import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
@ -38,10 +43,9 @@ import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
// LUCENE-2874
public class TokenSourcesTest extends LuceneTestCase {
private static final String FIELD = "text";
@ -262,7 +266,6 @@ public class TokenSourcesTest extends LuceneTestCase {
public void testTermVectorWithoutOffsetsThrowsException()
throws IOException, InvalidTokenOffsetsException {
final String TEXT = "the fox did not jump";
final Directory directory = newDirectory();
final IndexWriter indexWriter = new IndexWriter(directory,
newIndexWriterConfig(TEST_VERSION_CURRENT, null));
@ -280,8 +283,7 @@ public class TokenSourcesTest extends LuceneTestCase {
final IndexReader indexReader = DirectoryReader.open(directory);
try {
assertEquals(1, indexReader.numDocs());
final TokenStream tokenStream = TokenSources
.getTokenStream(
TokenSources.getTokenStream(
indexReader.getTermVector(0, FIELD),
false);
fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
@ -295,5 +297,68 @@ public class TokenSourcesTest extends LuceneTestCase {
}
}
int curOffset;
/**
 * Builds a token for {@code text} whose payload is a {@code BytesRef}
 * created from that same text.
 *
 * <p>The token starts at the current value of {@code curOffset} and ends
 * {@code text.length()} characters later; {@code curOffset} is then
 * advanced by one.
 */
private Token getToken(String text) {
  final int start = curOffset;
  final Token token = new Token(text, start, start + text.length());
  token.setPayload(new BytesRef(text));
  curOffset = start + 1;
  return token;
}
// LUCENE-5394
/**
 * Indexes one document whose field stores term vectors with offsets,
 * positions and payloads, then checks that
 * {@code TokenSources.getTokenStream} returns every token with its term
 * text, position increment, payload and offsets intact.
 *
 * <p>The check runs twice, passing {@code true} and then {@code false} as
 * the second argument, because the two values take different code paths.
 *
 * @throws Exception if indexing or reading the term vectors fails
 */
public void testPayloads() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  myFieldType.setStoreTermVectors(true);
  myFieldType.setStoreTermVectorOffsets(true);
  myFieldType.setStoreTermVectorPositions(true);
  myFieldType.setStoreTermVectorPayloads(true);

  curOffset = 0;

  Token[] tokens = new Token[] {
    getToken("foxes"),
    getToken("can"),
    getToken("jump"),
    getToken("high")
  };

  Document doc = new Document();
  doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
  writer.addDocument(doc);

  IndexReader reader = writer.getReader();
  writer.close();
  // Close the reader and directory even when an assertion fails.
  try {
    assertEquals(1, reader.numDocs());

    for (int i = 0; i < 2; i++) {
      // Do this twice, once passing true and then passing
      // false: they are entirely different code paths
      // under-the-hood:
      TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), i == 0);
      try {
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
        PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);

        // Follow the TokenStream consumer workflow:
        // reset(), incrementToken()..., end(), close().
        ts.reset();
        for (Token token : tokens) {
          assertTrue(ts.incrementToken());
          assertEquals(token.toString(), termAtt.toString());
          assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
          assertEquals(token.getPayload(), payloadAtt.getPayload());
          assertEquals(token.startOffset(), offsetAtt.startOffset());
          assertEquals(token.endOffset(), offsetAtt.endOffset());
        }

        assertFalse(ts.incrementToken());
        ts.end();
      } finally {
        ts.close();
      }
    }
  } finally {
    reader.close();
    dir.close();
  }
}
}