mirror of https://github.com/apache/lucene.git
LUCENE-6031: Optimize TokenSources term vector to TokenStream
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1642294 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 7d4cd4d9bb
commit 6083352aa7
@@ -347,6 +347,11 @@ Optimizations
* LUCENE-6033: CachingTokenFilter now uses ArrayList not LinkedList, and has new
  isCached() method. (David Smiley)

* LUCENE-6031: TokenSources (in the default highlighter) converts term vectors into a
  TokenStream much faster, in linear time (not N*log(N)), using less memory, and with reset()
  implemented. Only one of offsets or positions is required of the term vector.
  (David Smiley)

Build

* LUCENE-5909: Smoke tester now has better command line parsing and
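Editor's note: a minimal, hypothetical indexing-side sketch (not part of this commit) of a field whose term vector satisfies the requirement described above: offsets stored (required by the highlighter path), positions optional but recommended when stopwords are filtered. The helper class and field name are illustrative; the Lucene classes used here also appear later in this patch's test changes.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;

public class HighlightableFieldExample {            // hypothetical helper, not in the patch
  public static Document newDoc(String text) {
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);             // required by the term-vector highlighting path
    ft.setStoreTermVectorPositions(true);           // optional, but helps phrase queries near stopwords
    ft.freeze();
    Document doc = new Document();
    doc.add(new Field("text", text, ft));
    return doc;
  }
}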
@@ -21,24 +21,13 @@ package org.apache.lucene.search.highlight;
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;

/**
 * Hides implementation issues associated with obtaining a TokenStream for use
@@ -113,184 +102,47 @@ public class TokenSources {
return ts;
}

public static TokenStream getTokenStream(Terms vector) throws IOException {
// assumes the worst and makes no assumptions about token position
// sequences.
return getTokenStream(vector, false);
/** Simply calls {@link #getTokenStream(org.apache.lucene.index.Terms)} now. */
@Deprecated
public static TokenStream getTokenStream(Terms vector,
boolean tokenPositionsGuaranteedContiguous) throws IOException {
return getTokenStream(vector);
}

/**
* Low level api. Returns a token stream generated from a {@link Terms}. This
* Returns a token stream generated from a {@link Terms}. This
* can be used to feed the highlighter with a pre-parsed token
* stream. The {@link Terms} must have offsets available.
*
* In my tests the speeds to recreate 1000 token streams using this method
* are: - with TermVector offset only data stored - 420 milliseconds - with
* TermVector offset AND position data stored - 271 milliseconds (nb timings
* for TermVector with position data are based on a tokenizer with contiguous
* positions - no overlaps or gaps) The cost of not using TermPositionVector
* to store pre-parsed content and using an analyzer to re-parse the original
* content: - reanalyzing the original content - 980 milliseconds
*
* The re-analyze timings will typically vary depending on - 1) The complexity
* of the analyzer code (timings above were using a
* stemmer/lowercaser/stopword combo) 2) The number of other fields (Lucene
* reads ALL fields off the disk when accessing just one document field - can
* cost dear!) 3) Use of compression on field storage - could be faster due to
* compression (less disk IO) or slower (more CPU burn) depending on the
* content.
*
* @param tokenPositionsGuaranteedContiguous true if the token position
* numbers have no overlaps or gaps. If looking to eek out the last
* drops of performance, set to true. If in doubt, set to false.
* stream. The {@link Terms} must have offsets available. If there are no positions available,
* all tokens will have position increments reflecting adjacent tokens, or coincident when terms
* share a start offset. If there are stopwords filtered from the index, you probably want to ensure
* term vectors have positions so that phrase queries won't match across stopwords.
*
* @throws IllegalArgumentException if no offsets are available
*/
public static TokenStream getTokenStream(Terms tpv,
boolean tokenPositionsGuaranteedContiguous)
throws IOException {
public static TokenStream getTokenStream(final Terms tpv) throws IOException {

if (!tpv.hasOffsets()) {
throw new IllegalArgumentException("Cannot create TokenStream from Terms without offsets");
throw new IllegalArgumentException("Highlighting requires offsets from the TokenStream.");
//TokenStreamFromTermPositionVector can handle a lack of offsets if there are positions. But
// highlighters require offsets, so we insist here.
}

if (!tokenPositionsGuaranteedContiguous && tpv.hasPositions()) {
return new TokenStreamFromTermPositionVector(tpv);
}

// an object used to iterate across an array of tokens
final class StoredTokenStream extends TokenStream {
Token tokens[];

int currentToken = 0;

CharTermAttribute termAtt;

OffsetAttribute offsetAtt;

PositionIncrementAttribute posincAtt;

PayloadAttribute payloadAtt;

StoredTokenStream(Token tokens[]) {
this.tokens = tokens;
termAtt = addAttribute(CharTermAttribute.class);
offsetAtt = addAttribute(OffsetAttribute.class);
posincAtt = addAttribute(PositionIncrementAttribute.class);
payloadAtt = addAttribute(PayloadAttribute.class);
}

@Override
public boolean incrementToken() {
if (currentToken >= tokens.length) {
return false;
}
Token token = tokens[currentToken++];
clearAttributes();
termAtt.setEmpty().append(token);
offsetAtt.setOffset(token.startOffset(), token.endOffset());
BytesRef payload = token.getPayload();
if (payload != null) {
payloadAtt.setPayload(payload);
}
posincAtt
.setPositionIncrement(currentToken <= 1
|| tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
.startOffset() ? 1 : 0);
return true;
}
}

boolean hasPayloads = tpv.hasPayloads();

// code to reconstruct the original sequence of Tokens
TermsEnum termsEnum = tpv.iterator(null);
int totalTokens = 0;
while(termsEnum.next() != null) {
totalTokens += (int) termsEnum.totalTermFreq();
}
Token tokensInOriginalOrder[] = new Token[totalTokens];
ArrayList<Token> unsortedTokens = null;
termsEnum = tpv.iterator(null);
BytesRef text;
DocsAndPositionsEnum dpEnum = null;
while ((text = termsEnum.next()) != null) {

dpEnum = termsEnum.docsAndPositions(null, dpEnum);
if (dpEnum == null) {
throw new IllegalArgumentException(
"Required TermVector Offset information was not found");
}
final String term = text.utf8ToString();

dpEnum.nextDoc();
final int freq = dpEnum.freq();
for(int posUpto=0;posUpto<freq;posUpto++) {
final int pos = dpEnum.nextPosition();
if (dpEnum.startOffset() < 0) {
throw new IllegalArgumentException(
"Required TermVector Offset information was not found");
}
final Token token = new Token(term,
dpEnum.startOffset(),
dpEnum.endOffset());
if (hasPayloads) {
// Must make a deep copy of the returned payload,
// since D&PEnum API is allowed to re-use on every
// call:
token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
}

if (tokenPositionsGuaranteedContiguous && pos != -1) {
// We have positions stored and a guarantee that the token position
// information is contiguous

// This may be fast BUT wont work if Tokenizers used which create >1
// token in same position or
// creates jumps in position numbers - this code would fail under those
// circumstances

// tokens stored with positions - can use this to index straight into
// sorted array
tokensInOriginalOrder[pos] = token;
} else {
// tokens NOT stored with positions or not guaranteed contiguous - must
// add to list and sort later
if (unsortedTokens == null) {
unsortedTokens = new ArrayList<>();
}
unsortedTokens.add(token);
}
}
}

// If the field has been stored without position data we must perform a sort
if (unsortedTokens != null) {
tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens
.size()]);
ArrayUtil.timSort(tokensInOriginalOrder, new Comparator<Token>() {
@Override
public int compare(Token t1, Token t2) {
if (t1.startOffset() == t2.startOffset()) {
return t1.endOffset() - t2.endOffset();
} else {
return t1.startOffset() - t2.startOffset();
}
}
});
}
return new StoredTokenStream(tokensInOriginalOrder);
return new TokenStreamFromTermPositionVector(tpv);
}

/**
* Returns a {@link TokenStream} with positions and offsets constructed from
* field termvectors. If the field has no termvectors, or positions or offsets
* are not included in the termvector, return null.
* field termvectors. If the field has no termvectors or offsets
* are not included in the termvector, return null. See {@link #getTokenStream(org.apache.lucene.index.Terms)}
* for an explanation of what happens when positions aren't present.
*
* @param reader the {@link IndexReader} to retrieve term vectors from
* @param docId the document to retrieve termvectors for
* @param field the field to retrieve termvectors for
* @return a {@link TokenStream}, or null if positions and offsets are not available
* @return a {@link TokenStream}, or null if offsets are not available
* @throws IOException If there is a low-level I/O error
*
* @see #getTokenStream(org.apache.lucene.index.Terms)
*/
public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId,
String field) throws IOException {

@@ -305,7 +157,7 @@ public class TokenSources {
return null;
}

if (!vector.hasPositions() || !vector.hasOffsets()) {
if (!vector.hasOffsets()) {
return null;
}
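Editor's note: an illustrative sketch (not part of the patch) of how a highlighter caller might use the simplified single-argument API after this change. The field name "text" and the reader/query/storedText parameters are assumed to come from the caller; only the Lucene classes shown in this diff are relied on.

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenSources;

class HighlightWithTermVectors {                          // hypothetical caller, not in the patch
  static String bestFragment(IndexReader reader, int docId, Query query, String storedText)
      throws IOException, InvalidTokenOffsetsException {
    Terms vector = reader.getTermVector(docId, "text");   // term vector must store offsets
    TokenStream ts = TokenSources.getTokenStream(vector); // was getTokenStream(vector, false)
    Highlighter highlighter = new Highlighter(new QueryScorer(query));
    return highlighter.getBestFragment(ts, storedText);
  }
}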
@@ -17,12 +17,7 @@ package org.apache.lucene.search.highlight;
 * limitations under the License.
 */
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -32,106 +27,257 @@ import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.UnicodeUtil;

/**
* TokenStream created from a term vector field.
* TokenStream created from a term vector field. The term vector requires positions and/or offsets (either). If you
* want payloads add PayloadAttributeImpl (as you would normally) but don't assume the attribute is already added just
* because you know the term vector has payloads. This TokenStream supports an efficient {@link #reset()}, so there's
* no need to wrap with a caching impl.
* <p />
* The implementation will create an array of tokens indexed by token position. As long as there aren't massive jumps
* in positions, this is fine. And it assumes there aren't large numbers of tokens at the same position, since it adds
* them to a linked-list per position in O(N^2) complexity. When there aren't positions in the term vector, it divides
* the startOffset by 8 to use as a temporary substitute. In that case, tokens with the same startOffset will occupy
* the same final position; otherwise tokens become adjacent.
*
* @lucene.internal
*/
//TODO rename to TokenStreamFromTermVector
public final class TokenStreamFromTermPositionVector extends TokenStream {

private final List<Token> positionedTokens = new ArrayList<>();
//TODO add a maxStartOffset filter, which highlighters will find handy

private Iterator<Token> tokensAtCurrentPosition;
private final Terms vector;

private CharTermAttribute termAttribute;
private final CharTermAttribute termAttribute;

private PositionIncrementAttribute positionIncrementAttribute;
private final PositionIncrementAttribute positionIncrementAttribute;

private OffsetAttribute offsetAttribute;
private OffsetAttribute offsetAttribute;//maybe null

private PayloadAttribute payloadAttribute;
private PayloadAttribute payloadAttribute;//maybe null

private TokenLL firstToken = null; // the head of a linked-list

private TokenLL incrementToken = null;

/**
* Constructor.
*
* @param vector Terms that contains the data for
* creating the TokenStream. Must have positions and offsets.
* creating the TokenStream. Must have positions and/or offsets.
*/
public TokenStreamFromTermPositionVector(
final Terms vector) throws IOException {
public TokenStreamFromTermPositionVector(Terms vector) throws IOException {
if (!vector.hasPositions() && !vector.hasOffsets()) {
throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
}
assert vector.hasFreqs();
this.vector = vector;
termAttribute = addAttribute(CharTermAttribute.class);
positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
offsetAttribute = addAttribute(OffsetAttribute.class);
payloadAttribute = addAttribute(PayloadAttribute.class);
final boolean hasOffsets = vector.hasOffsets();
final boolean hasPayloads = vector.hasPayloads();
}

public Terms getTermVectorTerms() { return vector; }

@Override
public void reset() throws IOException {
if (firstToken == null) {//just the first time
init();
}
incrementToken = null;
super.reset();
}

//We initialize in reset() because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
if (vector.hasOffsets()) {
offsetAttribute = addAttribute(OffsetAttribute.class);
}
if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
payloadAttribute = getAttribute(PayloadAttribute.class);
}

// Step 1: iterate termsEnum and create a token, placing into an array of tokens by position

TokenLL[] positionedTokens = initTokensArray();

int lastPosition = -1;

final TermsEnum termsEnum = vector.iterator(null);
BytesRef text;
BytesRef termBytesRef;
DocsAndPositionsEnum dpEnum = null;
while((text = termsEnum.next()) != null) {
//int sumFreq = 0;
while ((termBytesRef = termsEnum.next()) != null) {
//Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
// note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
final char[] termChars = new char[termBytesRef.length];
final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, termChars);

dpEnum = termsEnum.docsAndPositions(null, dpEnum);
assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
dpEnum.nextDoc();
final int freq = dpEnum.freq();
//sumFreq += freq;
for (int j = 0; j < freq; j++) {
int pos = dpEnum.nextPosition();
Token token;
if (hasOffsets) {
token = new Token(text.utf8ToString(),
dpEnum.startOffset(),
dpEnum.endOffset());
} else {
token = new Token();
token.setEmpty().append(text.utf8ToString());
TokenLL token = new TokenLL();
token.termChars = termChars;
token.termCharsLen = termCharsLen;
if (offsetAttribute != null) {
token.startOffset = dpEnum.startOffset();
token.endOffset = dpEnum.endOffset();
if (pos == -1) {
pos = token.startOffset >> 3;//divide by 8
}
}
if (hasPayloads) {

if (payloadAttribute != null) {
// Must make a deep copy of the returned payload,
// since D&PEnum API is allowed to re-use on every
// call:
token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
final BytesRef payload = dpEnum.getPayload();
if (payload != null) {
token.payload = BytesRef.deepCopyOf(payload);//TODO share a ByteBlockPool & re-use BytesRef
}
}

// Yes - this is the position, not the increment! This is for
// sorting. This value
// will be corrected before use.
token.setPositionIncrement(pos);
this.positionedTokens.add(token);
//Add token to an array indexed by position
if (positionedTokens.length <= pos) {
//grow, but not 2x since we think our original length estimate is close
TokenLL[] newPositionedTokens = new TokenLL[(int)((pos + 1) * 1.5f)];
System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
positionedTokens = newPositionedTokens;
}
positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]);

lastPosition = Math.max(lastPosition, pos);
}
}
CollectionUtil.timSort(this.positionedTokens, tokenComparator);
int lastPosition = -1;
for (final Token token : this.positionedTokens) {
int thisPosition = token.getPositionIncrement();
token.setPositionIncrement(thisPosition - lastPosition);
lastPosition = thisPosition;

// System.out.println(String.format(
// "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
// sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
// (originalPositionEstimate/(lastPosition + 1.0f))));

// Step 2: Link all Tokens into a linked-list and set position increments as we go

int prevTokenPos = -1;
TokenLL prevToken = null;
for (int pos = 0; pos <= lastPosition; pos++) {
TokenLL token = positionedTokens[pos];
if (token == null) {
continue;
}
//link
if (prevToken != null) {
assert prevToken.next == null;
prevToken.next = token; //concatenate linked-list
} else {
assert firstToken == null;
firstToken = token;
}
//set increments
if (vector.hasPositions()) {
token.positionIncrement = pos - prevTokenPos;
while (token.next != null) {
token = token.next;
token.positionIncrement = 0;
}
} else {
token.positionIncrement = 1;
while (token.next != null) {
prevToken = token;
token = token.next;
if (prevToken.startOffset == token.startOffset) {
token.positionIncrement = 0;
} else {
token.positionIncrement = 1;
}
}
}
prevTokenPos = pos;
prevToken = token;
}
this.tokensAtCurrentPosition = this.positionedTokens.iterator();
}

private static final Comparator<Token> tokenComparator = new Comparator<Token>() {
@Override
public int compare(final Token o1, final Token o2) {
return o1.getPositionIncrement() - o2.getPositionIncrement();
private TokenLL[] initTokensArray() throws IOException {
// Estimate the number of position slots we need. We use some estimation factors taken from Wikipedia
// that reduce the likelihood of needing to expand the array.
int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
int size = (int) vector.size();
if (size == -1) {//doesn't happen with term vectors, it seems, but pick a default any way
size = 128;
}
sumTotalTermFreq = (int)(size * 2.4);
}
};

final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
return new TokenLL[originalPositionEstimate];
}

@Override
public boolean incrementToken() {
if (this.tokensAtCurrentPosition.hasNext()) {
final Token next = this.tokensAtCurrentPosition.next();
clearAttributes();
termAttribute.setEmpty().append(next);
positionIncrementAttribute.setPositionIncrement(next
.getPositionIncrement());
offsetAttribute.setOffset(next.startOffset(), next.endOffset());
payloadAttribute.setPayload(next.getPayload());
return true;
if (incrementToken == null) {
incrementToken = firstToken;
if (incrementToken == null) {
return false;
}
} else if (incrementToken.next != null) {
incrementToken = incrementToken.next;
} else {
return false;
}
return false;
clearAttributes();
termAttribute.copyBuffer(incrementToken.termChars, 0, incrementToken.termCharsLen);
positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement);
if (offsetAttribute != null) {
offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.endOffset);
}
if (payloadAttribute != null) {
payloadAttribute.setPayload(incrementToken.payload);
}
return true;
}

@Override
public void reset() {
this.tokensAtCurrentPosition = this.positionedTokens.iterator();
private static class TokenLL {
char[] termChars;
int termCharsLen;
int positionIncrement;
int startOffset;
int endOffset;
BytesRef payload;

TokenLL next;

/** Given the head of a linked-list (possibly null) this inserts the token at the correct
* spot to maintain the desired order, and returns the head (which could be this token if it's the smallest).
* O(N^2) complexity but N should be a handful at most.
*/
TokenLL insertIntoSortedLinkedList(final TokenLL head) {
assert next == null;
if (head == null) {
return this;
} else if (this.compareOffsets(head) <= 0) {
this.next = head;
return this;
}
TokenLL prev = head;
while (prev.next != null && this.compareOffsets(prev.next) > 0) {
prev = prev.next;
}
this.next = prev.next;
prev.next = this;
return head;
}

/** by startOffset then endOffset */
int compareOffsets(TokenLL tokenB) {
int cmp = Integer.compare(this.startOffset, tokenB.startOffset);
if (cmp == 0) {
cmp = Integer.compare(this.endOffset, tokenB.endOffset);
}
return cmp;
}
}
}
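Editor's note: an illustrative sketch (not part of the patch) of consuming the rebuilt stream directly, showing the payload caveat from the class javadoc above: the consumer adds PayloadAttribute itself before reset(), because the stream only wires payloads up when the attribute is already present. The helper class is hypothetical.

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.highlight.TokenStreamFromTermPositionVector;

class ConsumeTermVectorStream {                            // hypothetical caller, not in the patch
  static void dump(Terms vector) throws java.io.IOException {
    TokenStreamFromTermPositionVector ts = new TokenStreamFromTermPositionVector(vector);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); // opt in to payloads before reset()
    ts.reset();                                            // init() runs on the first reset()
    while (ts.incrementToken()) {
      System.out.println(termAtt + " payload=" + payloadAtt.getPayload());
    }
    ts.end();
    // per the javadoc, reset() is efficient, so the stream can be re-consumed
    // without wrapping it in a caching filter
    ts.close();
  }
}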
@@ -19,6 +19,8 @@ package org.apache.lucene.search.highlight;

import java.io.IOException;

import com.carrotsearch.randomizedtesting.annotations.Repeat;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

@@ -30,6 +32,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;

@@ -44,10 +47,10 @@ import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

// LUCENE-2874
public class TokenSourcesTest extends LuceneTestCase {
public class TokenSourcesTest extends BaseTokenStreamTestCase {
private static final String FIELD = "text";

private static final class OverlappingTokenStream extends TokenStream {

@@ -121,8 +124,7 @@ public class TokenSourcesTest extends LuceneTestCase {
new QueryScorer(query));
final TokenStream tokenStream = TokenSources
.getTokenStream(
indexReader.getTermVector(0, FIELD),
false);
indexReader.getTermVector(0, FIELD));
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {

@@ -166,8 +168,7 @@ public class TokenSourcesTest extends LuceneTestCase {
new QueryScorer(query));
final TokenStream tokenStream = TokenSources
.getTokenStream(
indexReader.getTermVector(0, FIELD),
false);
indexReader.getTermVector(0, FIELD));
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {

@@ -210,8 +211,7 @@ public class TokenSourcesTest extends LuceneTestCase {
new QueryScorer(phraseQuery));
final TokenStream tokenStream = TokenSources
.getTokenStream(
indexReader.getTermVector(0, FIELD),
false);
indexReader.getTermVector(0, FIELD));
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {

@@ -254,8 +254,7 @@ public class TokenSourcesTest extends LuceneTestCase {
new QueryScorer(phraseQuery));
final TokenStream tokenStream = TokenSources
.getTokenStream(
indexReader.getTermVector(0, FIELD),
false);
indexReader.getTermVector(0, FIELD));
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {

@@ -284,8 +283,7 @@ public class TokenSourcesTest extends LuceneTestCase {
try {
assertEquals(1, indexReader.numDocs());
TokenSources.getTokenStream(
indexReader.getTermVector(0, FIELD),
false);
indexReader.getTermVector(0, FIELD));
fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
}
catch (IllegalArgumentException e) {

@@ -335,27 +333,98 @@ public class TokenSourcesTest extends LuceneTestCase {
writer.close();
assertEquals(1, reader.numDocs());

for(int i=0;i<2;i++) {
// Do this twice, once passing true and then passing
// false: they are entirely different code paths
// under-the-hood:
TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), i == 0);
TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));

CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);

for(Token token : tokens) {
assertTrue(ts.incrementToken());
assertEquals(token.toString(), termAtt.toString());
assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
assertEquals(token.getPayload(), payloadAtt.getPayload());
assertEquals(token.startOffset(), offsetAtt.startOffset());
assertEquals(token.endOffset(), offsetAtt.endOffset());
ts.reset();
for(Token token : tokens) {
assertTrue(ts.incrementToken());
assertEquals(token.toString(), termAtt.toString());
assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
assertEquals(token.getPayload(), payloadAtt.getPayload());
assertEquals(token.startOffset(), offsetAtt.startOffset());
assertEquals(token.endOffset(), offsetAtt.endOffset());
}

assertFalse(ts.incrementToken());

reader.close();
dir.close();
}

@Repeat(iterations = 10)
//@Seed("947083AB20AB2D4F")
public void testRandomizedRoundTrip() throws Exception {
final int distinct = TestUtil.nextInt(random(), 1, 10);

String[] terms = new String[distinct];
BytesRef[] termBytes = new BytesRef[distinct];
for (int i = 0; i < distinct; ++i) {
terms[i] = TestUtil.randomRealisticUnicodeString(random());
termBytes[i] = new BytesRef(terms[i]);
}

final BaseTermVectorsFormatTestCase.RandomTokenStream rTokenStream =
new BaseTermVectorsFormatTestCase.RandomTokenStream(TestUtil.nextInt(random(), 1, 10), terms, termBytes, false);
//check to see if the token streams might have non-deterministic testable result
final boolean storeTermVectorPositions = random().nextBoolean();
final int[] startOffsets = rTokenStream.getStartOffsets();
final int[] positionsIncrements = rTokenStream.getPositionsIncrements();
for (int i = 1; i < positionsIncrements.length; i++) {
if (storeTermVectorPositions && positionsIncrements[i] != 0) {
continue;
}
//TODO should RandomTokenStream ensure endOffsets for tokens at same position and same startOffset are greater
// than previous token's endOffset? That would increase the testable possibilities.
if (startOffsets[i] == startOffsets[i-1]) {
if (VERBOSE)
System.out.println("Skipping test because can't easily validate random token-stream is correct.");
return;
}
}

assertFalse(ts.incrementToken());
//sanity check itself
assertTokenStreamContents(rTokenStream,
rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(),
rTokenStream.getPositionsIncrements());

Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
myFieldType.setStoreTermVectors(true);
myFieldType.setStoreTermVectorOffsets(true);
myFieldType.setStoreTermVectorPositions(storeTermVectorPositions);
//payloads require positions; it will throw an error otherwise
myFieldType.setStoreTermVectorPayloads(storeTermVectorPositions && random().nextBoolean());

Document doc = new Document();
doc.add(new Field("field", rTokenStream, myFieldType));
writer.addDocument(doc);

IndexReader reader = writer.getReader();
writer.close();
assertEquals(1, reader.numDocs());

TokenStream vectorTokenStream = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));

//sometimes check payloads
PayloadAttribute payloadAttribute = null;
if (myFieldType.storeTermVectorPayloads() && usually()) {
payloadAttribute = vectorTokenStream.addAttribute(PayloadAttribute.class);
}
assertTokenStreamContents(vectorTokenStream,
rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(),
myFieldType.storeTermVectorPositions() ? rTokenStream.getPositionsIncrements() : null);
//test payloads
if (payloadAttribute != null) {
vectorTokenStream.reset();
for (int i = 0; vectorTokenStream.incrementToken(); i++) {
assertEquals(rTokenStream.getPayloads()[i], payloadAttribute.getPayload());
}
}

reader.close();
@@ -95,17 +95,6 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
return ft;
}

protected BytesRef randomPayload() {
final int len = random().nextInt(5);
if (len == 0) {
return null;
}
final BytesRef payload = new BytesRef(len);
random().nextBytes(payload.bytes);
payload.length = len;
return payload;
}

@Override
protected void addRandomFields(Document doc) {
for (Options opts : validOptions()) {

@@ -172,7 +161,9 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
}

// TODO: use CannedTokenStream?
protected class RandomTokenStream extends TokenStream {
// TODO: pull out and make top-level-utility, separate from TermVectors
/** Produces a random TokenStream based off of provided terms. */
public static class RandomTokenStream extends TokenStream {

final String[] terms;
final BytesRef[] termBytes;

@@ -191,11 +182,11 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
final PayloadAttribute pAtt;
int i = 0;

protected RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes) {
public RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes) {
this(len, sampleTerms, sampleTermBytes, rarely());
}

protected RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes, boolean offsetsGoBackwards) {
public RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes, boolean offsetsGoBackwards) {
terms = new String[len];
termBytes = new BytesRef[len];
positionsIncrements = new int[len];

@@ -266,6 +257,17 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
pAtt = addAttribute(PayloadAttribute.class);
}

protected BytesRef randomPayload() {
final int len = random().nextInt(5);
if (len == 0) {
return null;
}
final BytesRef payload = new BytesRef(len);
random().nextBytes(payload.bytes);
payload.length = len;
return payload;
}

public boolean hasPayloads() {
for (BytesRef payload : payloads) {
if (payload != null && payload.length > 0) {

@@ -275,9 +277,40 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
return false;
}

public String[] getTerms() {
return terms;
}

public BytesRef[] getTermBytes() {
return termBytes;
}

public int[] getPositionsIncrements() {
return positionsIncrements;
}

public int[] getStartOffsets() {
return startOffsets;
}

public int[] getEndOffsets() {
return endOffsets;
}

public BytesRef[] getPayloads() {
return payloads;
}

@Override
public void reset() throws IOException {
i = 0;
super.reset();
}

@Override
public final boolean incrementToken() throws IOException {
if (i < terms.length) {
clearAttributes();
termAtt.setLength(0).append(terms[i]);
piAtt.setPositionIncrement(positionsIncrements[i]);
oAtt.setOffset(startOffsets[i], endOffsets[i]);