mirror of https://github.com/apache/lucene.git

commit bd85dca2af (parent 13084e4e81)

    LUCENE-5394: Fix TokenSources.getTokenStream to return payloads if they were indexed with term vectors

    git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1557439 13f79535-47bb-0310-9956-ffa450edef68
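The practical effect of the fix: a consumer of the token stream that TokenSources rebuilds from a term vector can now read PayloadAttribute values. A minimal consumer sketch follows (not part of this commit; the class and method names are illustrative, and it assumes a 4.x reader whose field was indexed with term vectors carrying positions, offsets, and payloads, as in the test added below):

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.util.BytesRef;

public class TermVectorPayloadDump {

  /** Prints each term from the document's term vector together with the
   *  payload this fix now carries through, or "(none)" if absent. */
  public static void dump(IndexReader reader, int docID, String field) throws IOException {
    Terms vector = reader.getTermVector(docID, field);
    TokenStream ts = TokenSources.getTokenStream(vector, false);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      BytesRef payload = payloadAtt.getPayload();
      System.out.println(termAtt.toString() + " -> "
          + (payload == null ? "(none)" : payload.utf8ToString()));
    }
    ts.end();
    ts.close();
  }
}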
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -153,6 +153,9 @@ Bug fixes
   domain-only URLs that are followed by an alphanumeric character.
   (Chris Geeringh, Steve Rowe)
 
+* LUCENE-5394: Fix TokenSources.getTokenStream to return payloads if
+  they were indexed with the term vectors. (Mike McCandless)
+
 API Changes
 
 * LUCENE-5339: The facet module was simplified/reworked to make the
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
@@ -20,11 +20,16 @@ package org.apache.lucene.search.highlight;
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Fields;
@@ -35,10 +40,6 @@ import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Comparator;
-
 /**
  * Hides implementation issues associated with obtaining a TokenStream for use
  * with the higlighter - can obtain from TermFreqVectors with offsets and
@@ -169,11 +170,14 @@ public class TokenSources {
 
       PositionIncrementAttribute posincAtt;
 
+      PayloadAttribute payloadAtt;
+
       StoredTokenStream(Token tokens[]) {
         this.tokens = tokens;
         termAtt = addAttribute(CharTermAttribute.class);
         offsetAtt = addAttribute(OffsetAttribute.class);
         posincAtt = addAttribute(PositionIncrementAttribute.class);
+        payloadAtt = addAttribute(PayloadAttribute.class);
       }
 
       @Override
@@ -185,6 +189,10 @@ public class TokenSources {
         clearAttributes();
         termAtt.setEmpty().append(token);
         offsetAtt.setOffset(token.startOffset(), token.endOffset());
+        BytesRef payload = token.getPayload();
+        if (payload != null) {
+          payloadAtt.setPayload(payload);
+        }
         posincAtt
             .setPositionIncrement(currentToken <= 1
                 || tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
@@ -192,6 +200,9 @@ public class TokenSources {
         return true;
       }
     }
+
+    boolean hasPayloads = tpv.hasPayloads();
+
     // code to reconstruct the original sequence of Tokens
     TermsEnum termsEnum = tpv.iterator(null);
     int totalTokens = 0;
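The hasPayloads() call introduced above comes from the 4.x Terms API, which also reports whether positions and offsets were recorded in the term vector. A hedged sketch probing those capabilities before token reconstruction (the helper class name is illustrative, not from this commit):

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;

public class TermVectorCapabilities {

  /** Reports what the stored term vector for this doc/field can provide. */
  public static void describe(IndexReader reader, int docID, String field) throws IOException {
    Terms tpv = reader.getTermVector(docID, field);
    if (tpv == null) {
      System.out.println("no term vector for field: " + field);
      return;
    }
    System.out.println("positions: " + tpv.hasPositions()); // contiguous-token path
    System.out.println("offsets:   " + tpv.hasOffsets());   // required by the highlighter
    System.out.println("payloads:  " + tpv.hasPayloads());  // propagated by this fix
  }
}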
@@ -223,6 +234,13 @@ public class TokenSources {
         final Token token = new Token(term,
             dpEnum.startOffset(),
             dpEnum.endOffset());
+        if (hasPayloads) {
+          // Must make a deep copy of the returned payload,
+          // since D&PEnum API is allowed to re-use on every
+          // call:
+          token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+        }
+
         if (tokenPositionsGuaranteedContiguous && pos != -1) {
           // We have positions stored and a guarantee that the token position
           // information is contiguous
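The deep copy guards against BytesRef aliasing: DocsAndPositionsEnum.getPayload() may return a BytesRef whose backing array the enum reuses on the next call, so storing the bare reference would leave every reconstructed Token pointing at the last payload read. A standalone sketch of the hazard, with a plain byte-array mutation standing in for the enum's reuse (class name is illustrative):

import java.nio.charset.StandardCharsets;

import org.apache.lucene.util.BytesRef;

public class DeepCopyDemo {
  public static void main(String[] args) {
    byte[] buffer = "abc".getBytes(StandardCharsets.UTF_8);
    BytesRef shared = new BytesRef(buffer);       // aliases buffer directly
    BytesRef copy = BytesRef.deepCopyOf(shared);  // owns a private copy
    buffer[0] = 'z';                              // stand-in for enum reuse
    System.out.println(shared.utf8ToString());    // "zbc" - sees the mutation
    System.out.println(copy.utf8ToString());      // "abc" - unaffected
  }
}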
@@ -253,9 +271,11 @@ public class TokenSources {
       ArrayUtil.timSort(tokensInOriginalOrder, new Comparator<Token>() {
         @Override
         public int compare(Token t1, Token t2) {
-          if (t1.startOffset() == t2.startOffset()) return t1.endOffset()
-              - t2.endOffset();
-          else return t1.startOffset() - t2.startOffset();
+          if (t1.startOffset() == t2.startOffset()) {
+            return t1.endOffset() - t2.endOffset();
+          } else {
+            return t1.startOffset() - t2.startOffset();
+          }
         }
       });
     }
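This comparator change is purely stylistic (braces added); the ordering is unchanged: tokens sort by startOffset, with endOffset breaking ties. A small self-contained check of that ordering, using int pairs in place of Tokens (illustrative, not commit code):

import java.util.Arrays;
import java.util.Comparator;

public class OffsetOrderDemo {
  public static void main(String[] args) {
    int[][] spans = { {2, 3}, {1, 6}, {1, 4} };   // {startOffset, endOffset}
    Arrays.sort(spans, new Comparator<int[]>() {
      @Override
      public int compare(int[] t1, int[] t2) {
        if (t1[0] == t2[0]) {
          return t1[1] - t2[1];  // same start: shorter span first
        } else {
          return t1[0] - t2[0];  // otherwise by start offset
        }
      }
    });
    System.out.println(Arrays.deepToString(spans)); // [[1, 4], [1, 6], [2, 3]]
  }
}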
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Terms;
@@ -48,6 +49,8 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
 
   private OffsetAttribute offsetAttribute;
 
+  private PayloadAttribute payloadAttribute;
+
   /**
    * Constructor.
    *
@@ -59,7 +62,9 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
     termAttribute = addAttribute(CharTermAttribute.class);
     positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
     offsetAttribute = addAttribute(OffsetAttribute.class);
+    payloadAttribute = addAttribute(PayloadAttribute.class);
     final boolean hasOffsets = vector.hasOffsets();
+    final boolean hasPayloads = vector.hasPayloads();
     final TermsEnum termsEnum = vector.iterator(null);
     BytesRef text;
     DocsAndPositionsEnum dpEnum = null;
@@ -79,6 +84,13 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
           token = new Token();
           token.setEmpty().append(text.utf8ToString());
         }
+        if (hasPayloads) {
+          // Must make a deep copy of the returned payload,
+          // since D&PEnum API is allowed to re-use on every
+          // call:
+          token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+        }
+
         // Yes - this is the position, not the increment! This is for
         // sorting. This value
         // will be corrected before use.
@@ -112,6 +124,7 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
       positionIncrementAttribute.setPositionIncrement(next
           .getPositionIncrement());
       offsetAttribute.setOffset(next.startOffset(), next.endOffset());
+      payloadAttribute.setPayload(next.getPayload());
       return true;
     }
     return false;
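Both files in this commit follow the same TokenStream attribute discipline: register PayloadAttribute once via addAttribute() at construction time, then populate it for each token inside incrementToken(). A self-contained sketch of that pattern under the 4.x analysis API (the PayloadEchoStream class is illustrative, not from the commit):

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;

public final class PayloadEchoStream extends TokenStream {
  private final String[] terms;
  // Register attributes once, at construction time:
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
  private int upto;

  public PayloadEchoStream(String... terms) {
    this.terms = terms;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (upto == terms.length) {
      return false;
    }
    // Populate the registered attributes for each token:
    clearAttributes();
    termAtt.setEmpty().append(terms[upto]);
    payloadAtt.setPayload(new BytesRef(terms[upto])); // echo text as payload
    upto++;
    return true;
  }

  @Override
  public void reset() throws IOException {
    upto = 0;
  }
}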
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
--- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
@@ -17,10 +17,14 @@ package org.apache.lucene.search.highlight;
  * limitations under the License.
  */
 
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -29,6 +33,7 @@ import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.DisjunctionMaxQuery;
 import org.apache.lucene.search.IndexSearcher;
@@ -38,10 +43,9 @@ import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 
-import java.io.IOException;
-
 // LUCENE-2874
 public class TokenSourcesTest extends LuceneTestCase {
   private static final String FIELD = "text";
@@ -262,7 +266,6 @@ public class TokenSourcesTest extends LuceneTestCase {
 
   public void testTermVectorWithoutOffsetsThrowsException()
       throws IOException, InvalidTokenOffsetsException {
-    final String TEXT = "the fox did not jump";
     final Directory directory = newDirectory();
     final IndexWriter indexWriter = new IndexWriter(directory,
         newIndexWriterConfig(TEST_VERSION_CURRENT, null));
@@ -280,8 +283,7 @@ public class TokenSourcesTest extends LuceneTestCase {
     final IndexReader indexReader = DirectoryReader.open(directory);
     try {
       assertEquals(1, indexReader.numDocs());
-      final TokenStream tokenStream = TokenSources
-          .getTokenStream(
+      TokenSources.getTokenStream(
           indexReader.getTermVector(0, FIELD),
           false);
       fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
@@ -295,5 +297,68 @@ public class TokenSourcesTest extends LuceneTestCase {
       }
     }
   }
 
+  int curOffset;
+
+  /** Just make a token with the text, and set the payload
+   *  to the text as well. Offsets increment "naturally". */
+  private Token getToken(String text) {
+    Token t = new Token(text, curOffset, curOffset+text.length());
+    t.setPayload(new BytesRef(text));
+    curOffset++;
+    return t;
+  }
+
+  // LUCENE-5394
+  public void testPayloads() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
+    myFieldType.setStoreTermVectors(true);
+    myFieldType.setStoreTermVectorOffsets(true);
+    myFieldType.setStoreTermVectorPositions(true);
+    myFieldType.setStoreTermVectorPayloads(true);
+
+    curOffset = 0;
+
+    Token[] tokens = new Token[] {
+      getToken("foxes"),
+      getToken("can"),
+      getToken("jump"),
+      getToken("high")
+    };
+
+    Document doc = new Document();
+    doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
+    writer.addDocument(doc);
+
+    IndexReader reader = writer.getReader();
+    writer.close();
+    assertEquals(1, reader.numDocs());
+
+    for(int i=0;i<2;i++) {
+      // Do this twice, once passing true and then passing
+      // false: they are entirely different code paths
+      // under-the-hood:
+      TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), i == 0);
+
+      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+      PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
+      OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
+      PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
+
+      for(Token token : tokens) {
+        assertTrue(ts.incrementToken());
+        assertEquals(token.toString(), termAtt.toString());
+        assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
+        assertEquals(token.getPayload(), payloadAtt.getPayload());
+        assertEquals(token.startOffset(), offsetAtt.startOffset());
+        assertEquals(token.endOffset(), offsetAtt.endOffset());
+      }
+
+      assertFalse(ts.incrementToken());
+    }
+
+    reader.close();
+    dir.close();
+  }
 }
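The new test injects payloads directly through CannedTokenStream; in a real analysis chain they typically come from a filter such as DelimitedPayloadTokenFilter in the analyzers-common module. A hedged sketch of such an analyzer (the composition, the FloatEncoder choice, and the Version constant are assumptions, not part of this commit):

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.FloatEncoder;
import org.apache.lucene.util.Version;

/** Tokens like "jump|2.5" become term "jump" with a 4-byte float payload. */
public final class PayloadAnalyzer extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_46, reader);
    TokenStream filter = new DelimitedPayloadTokenFilter(source, '|', new FloatEncoder());
    return new TokenStreamComponents(source, filter);
  }
}

Indexing with such an analyzer (and a FieldType that stores term vector positions, offsets, and payloads, as in the test) would let the highlighter-side stream returned by TokenSources.getTokenStream surface those payloads after this fix.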