mirror of https://github.com/apache/lucene.git
LUCENE-5394: Fix TokenSources.getTokenStream to return payloads if they were indexed with term vectors
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1557439 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 13084e4e81
commit bd85dca2af
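
As a quick orientation (not part of the commit itself): with a field indexed to store term vectors with positions, offsets, and payloads, the stream returned by TokenSources.getTokenStream should now carry each token's payload. A minimal consumer sketch, assuming an already-open IndexReader and a hypothetical docId/field; before this commit, payloadAtt.getPayload() came back null even when payloads were stored in the vectors:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.highlight.TokenSources;

public class PayloadStreamDemo {
  /** Prints term/payload pairs recovered from one document's term vector. */
  public static void dump(IndexReader reader, int docId, String field) throws Exception {
    // Assumes the field was indexed with term vectors + positions + offsets + payloads.
    TokenStream ts = TokenSources.getTokenStream(
        reader.getTermVectors(docId).terms(field),
        false); // false = do not require contiguous token positions
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(termAtt + " payload=" + payloadAtt.getPayload());
    }
    ts.end();
    ts.close();
  }
}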

lucene/CHANGES.txt

@@ -153,6 +153,9 @@ Bug fixes
   domain-only URLs that are followed by an alphanumeric character.
   (Chris Geeringh, Steve Rowe)
 
+* LUCENE-5394: Fix TokenSources.getTokenStream to return payloads if
+  they were indexed with the term vectors. (Mike McCandless)
+
 API Changes
 
 * LUCENE-5339: The facet module was simplified/reworked to make the

lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java

@@ -20,11 +20,16 @@ package org.apache.lucene.search.highlight;
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Fields;
@@ -35,10 +40,6 @@ import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Comparator;
-
 /**
  * Hides implementation issues associated with obtaining a TokenStream for use
  * with the highlighter - can obtain from TermFreqVectors with offsets and
@@ -169,11 +170,14 @@ public class TokenSources {
 
       PositionIncrementAttribute posincAtt;
+
+      PayloadAttribute payloadAtt;
 
       StoredTokenStream(Token tokens[]) {
        this.tokens = tokens;
        termAtt = addAttribute(CharTermAttribute.class);
        offsetAtt = addAttribute(OffsetAttribute.class);
        posincAtt = addAttribute(PositionIncrementAttribute.class);
+       payloadAtt = addAttribute(PayloadAttribute.class);
      }
 
      @Override
@@ -185,6 +189,10 @@ public class TokenSources {
        clearAttributes();
        termAtt.setEmpty().append(token);
        offsetAtt.setOffset(token.startOffset(), token.endOffset());
+       BytesRef payload = token.getPayload();
+       if (payload != null) {
+         payloadAtt.setPayload(payload);
+       }
        posincAtt
            .setPositionIncrement(currentToken <= 1
                || tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
@@ -192,6 +200,9 @@ public class TokenSources {
        return true;
      }
    }
+
+   boolean hasPayloads = tpv.hasPayloads();
+
    // code to reconstruct the original sequence of Tokens
    TermsEnum termsEnum = tpv.iterator(null);
    int totalTokens = 0;
@@ -223,6 +234,13 @@ public class TokenSources {
        final Token token = new Token(term,
            dpEnum.startOffset(),
            dpEnum.endOffset());
+       if (hasPayloads) {
+         // Must make a deep copy of the returned payload,
+         // since D&PEnum API is allowed to re-use on every
+         // call:
+         token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+       }
+
        if (tokenPositionsGuaranteedContiguous && pos != -1) {
          // We have positions stored and a guarantee that the token position
          // information is contiguous
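
The deep copy above is essential: DocsAndPositionsEnum may hand back the same mutable BytesRef on every getPayload() call, overwriting it in place as iteration advances, so storing the bare reference would silently corrupt payloads captured earlier. A standalone illustration of that aliasing hazard (a contrived stand-in, not code from this commit):

import org.apache.lucene.util.BytesRef;

public class DeepCopyDemo {
  public static void main(String[] args) {
    BytesRef shared = new BytesRef("foxes");     // stands in for the enum's reused buffer
    BytesRef alias = shared;                     // keeping the reference directly...
    BytesRef copy = BytesRef.deepCopyOf(shared); // ...versus taking a snapshot
    shared.bytes[shared.offset] = (byte) 'b';    // the enum advances and rewrites its buffer
    System.out.println(alias.utf8ToString());    // "boxes" - the aliased payload was clobbered
    System.out.println(copy.utf8ToString());     // "foxes" - the deep copy is stable
  }
}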
@@ -253,9 +271,11 @@ public class TokenSources {
      ArrayUtil.timSort(tokensInOriginalOrder, new Comparator<Token>() {
        @Override
        public int compare(Token t1, Token t2) {
-         if (t1.startOffset() == t2.startOffset()) return t1.endOffset()
-             - t2.endOffset();
-         else return t1.startOffset() - t2.startOffset();
+         if (t1.startOffset() == t2.startOffset()) {
+           return t1.endOffset() - t2.endOffset();
+         } else {
+           return t1.startOffset() - t2.startOffset();
+         }
        }
      });
    }

lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Terms;
@@ -48,6 +49,8 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
 
   private OffsetAttribute offsetAttribute;
 
+  private PayloadAttribute payloadAttribute;
+
   /**
    * Constructor.
    *
@@ -59,7 +62,9 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
     termAttribute = addAttribute(CharTermAttribute.class);
     positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
     offsetAttribute = addAttribute(OffsetAttribute.class);
+    payloadAttribute = addAttribute(PayloadAttribute.class);
     final boolean hasOffsets = vector.hasOffsets();
+    final boolean hasPayloads = vector.hasPayloads();
     final TermsEnum termsEnum = vector.iterator(null);
     BytesRef text;
     DocsAndPositionsEnum dpEnum = null;
@@ -79,6 +84,13 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
        token = new Token();
        token.setEmpty().append(text.utf8ToString());
      }
+     if (hasPayloads) {
+       // Must make a deep copy of the returned payload,
+       // since D&PEnum API is allowed to re-use on every
+       // call:
+       token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+     }
+
      // Yes - this is the position, not the increment! This is for
      // sorting. This value
      // will be corrected before use.
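
The "position, not the increment" comment flags a small trick used here: each Token temporarily stores its absolute position in the increment field so the tokens can be sorted by position, and the field is rewritten as a delta afterwards. A hedged sketch of that correction pass (hypothetical helper, simplified from what the class actually does):

import java.util.List;
import org.apache.lucene.analysis.Token;

final class PositionFixup {
  /** Rewrites absolute positions (stashed in the increment field) as real increments. */
  static void toIncrements(List<Token> sortedByPosition) {
    int lastPosition = -1;
    for (Token token : sortedByPosition) {
      int thisPosition = token.getPositionIncrement();         // currently the absolute position
      token.setPositionIncrement(thisPosition - lastPosition); // now a proper increment
      lastPosition = thisPosition;
    }
  }
}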
@@ -112,6 +124,7 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
      positionIncrementAttribute.setPositionIncrement(next
          .getPositionIncrement());
      offsetAttribute.setOffset(next.startOffset(), next.endOffset());
+     payloadAttribute.setPayload(next.getPayload());
      return true;
    }
    return false;

lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
@@ -17,10 +17,14 @@ package org.apache.lucene.search.highlight;
  * limitations under the License.
  */
 
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -29,6 +33,7 @@ import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.DisjunctionMaxQuery;
 import org.apache.lucene.search.IndexSearcher;
@@ -38,10 +43,9 @@ import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 
-import java.io.IOException;
-
 // LUCENE-2874
 public class TokenSourcesTest extends LuceneTestCase {
   private static final String FIELD = "text";
@@ -262,7 +266,6 @@ public class TokenSourcesTest extends LuceneTestCase {
 
   public void testTermVectorWithoutOffsetsThrowsException()
       throws IOException, InvalidTokenOffsetsException {
     final String TEXT = "the fox did not jump";
     final Directory directory = newDirectory();
     final IndexWriter indexWriter = new IndexWriter(directory,
         newIndexWriterConfig(TEST_VERSION_CURRENT, null));
@@ -280,8 +283,7 @@ public class TokenSourcesTest extends LuceneTestCase {
     final IndexReader indexReader = DirectoryReader.open(directory);
     try {
       assertEquals(1, indexReader.numDocs());
-      final TokenStream tokenStream = TokenSources
-          .getTokenStream(
+      TokenSources.getTokenStream(
           indexReader.getTermVector(0, FIELD),
           false);
       fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
@@ -295,5 +297,68 @@ public class TokenSourcesTest extends LuceneTestCase {
     }
   }
 
+  int curOffset;
+
+  /** Just make a token with the text, and set the payload
+   *  to the text as well. Offsets increment "naturally". */
+  private Token getToken(String text) {
+    Token t = new Token(text, curOffset, curOffset+text.length());
+    t.setPayload(new BytesRef(text));
+    curOffset++;
+    return t;
+  }
+
+  // LUCENE-5394
+  public void testPayloads() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
+    myFieldType.setStoreTermVectors(true);
+    myFieldType.setStoreTermVectorOffsets(true);
+    myFieldType.setStoreTermVectorPositions(true);
+    myFieldType.setStoreTermVectorPayloads(true);
+
+    curOffset = 0;
+
+    Token[] tokens = new Token[] {
+      getToken("foxes"),
+      getToken("can"),
+      getToken("jump"),
+      getToken("high")
+    };
+
+    Document doc = new Document();
+    doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
+    writer.addDocument(doc);
+
+    IndexReader reader = writer.getReader();
+    writer.close();
+    assertEquals(1, reader.numDocs());
+
+    for(int i=0;i<2;i++) {
+      // Do this twice, once passing true and then passing
+      // false: they are entirely different code paths
+      // under-the-hood:
+      TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), i == 0);
+
+      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+      PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
+      OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
+      PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
+
+      for(Token token : tokens) {
+        assertTrue(ts.incrementToken());
+        assertEquals(token.toString(), termAtt.toString());
+        assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
+        assertEquals(token.getPayload(), payloadAtt.getPayload());
+        assertEquals(token.startOffset(), offsetAtt.startOffset());
+        assertEquals(token.endOffset(), offsetAtt.endOffset());
+      }
+
+      assertFalse(ts.incrementToken());
+    }
+
+    reader.close();
+    dir.close();
+  }
 }
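
To exercise just the new test against a checkout of this revision, something like the following should work with the ant-based build of that era (the flags are from memory, so treat them as an assumption):

cd lucene/highlighter
ant test -Dtestcase=TokenSourcesTest -Dtests.method=testPayloads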