LUCENE-6031: Optimize TokenSources term vector to TokenStream

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1642294 13f79535-47bb-0310-9956-ffa450edef68
David Wayne Smiley 2014-11-28 13:35:19 +00:00
parent 7d4cd4d9bb
commit 6083352aa7
5 changed files with 382 additions and 277 deletions

CHANGES.txt

@@ -347,6 +347,11 @@ Optimizations
* LUCENE-6033: CachingTokenFilter now uses ArrayList not LinkedList, and has new
isCached() method. (David Smiley)
* LUCENE-6031: TokenSources (in the default highlighter) converts term vectors into a
TokenStream much faster, in linear time (not N*log(N)), using less memory, and with
reset() implemented. Only one of offsets or positions is required of the term vector.
(David Smiley)
Build
* LUCENE-5909: Smoke tester now has better command line parsing and

TokenSources.java

@@ -21,24 +21,13 @@ package org.apache.lucene.search.highlight;
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
/**
* Hides implementation issues associated with obtaining a TokenStream for use
@@ -113,184 +102,47 @@ public class TokenSources {
return ts;
}
public static TokenStream getTokenStream(Terms vector) throws IOException {
// assumes the worst and makes no assumptions about token position
// sequences.
return getTokenStream(vector, false);
/** Simply calls {@link #getTokenStream(org.apache.lucene.index.Terms)} now. */
@Deprecated
public static TokenStream getTokenStream(Terms vector,
boolean tokenPositionsGuaranteedContiguous) throws IOException {
return getTokenStream(vector);
}
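Editor's note: callers of the deprecated two-argument overload can simply drop the boolean. A minimal migration sketch; the reader, docId, and "body" names are assumptions for illustration, not part of this commit:

    Terms vector = reader.getTermVector(docId, "body");   // term vector indexed with offsets
    // before: TokenStream ts = TokenSources.getTokenStream(vector, false);
    TokenStream ts = TokenSources.getTokenStream(vector); // the flag no longer changes behavior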
/**
* Low level api. Returns a token stream generated from a {@link Terms}. This
* Returns a token stream generated from a {@link Terms}. This
* can be used to feed the highlighter with a pre-parsed token
* stream. The {@link Terms} must have offsets available.
*
* In my tests the speeds to recreate 1000 token streams using this method
* are: - with TermVector offset only data stored - 420 milliseconds - with
* TermVector offset AND position data stored - 271 milliseconds (nb timings
* for TermVector with position data are based on a tokenizer with contiguous
* positions - no overlaps or gaps) The cost of not using TermPositionVector
* to store pre-parsed content and using an analyzer to re-parse the original
* content: - reanalyzing the original content - 980 milliseconds
*
* The re-analyze timings will typically vary depending on - 1) The complexity
* of the analyzer code (timings above were using a
* stemmer/lowercaser/stopword combo) 2) The number of other fields (Lucene
* reads ALL fields off the disk when accessing just one document field - can
* cost dear!) 3) Use of compression on field storage - could be faster due to
* compression (less disk IO) or slower (more CPU burn) depending on the
* content.
*
* @param tokenPositionsGuaranteedContiguous true if the token position
* numbers have no overlaps or gaps. If looking to eek out the last
* drops of performance, set to true. If in doubt, set to false.
* stream. The {@link Terms} must have offsets available. If there are no positions available,
* all tokens will have position increments reflecting adjacent tokens, or coincident when terms
* share a start offset. If there are stopwords filtered from the index, you probably want to ensure
* term vectors have positions so that phrase queries won't match across stopwords.
*
* @throws IllegalArgumentException if no offsets are available
*/
public static TokenStream getTokenStream(Terms tpv,
boolean tokenPositionsGuaranteedContiguous)
throws IOException {
public static TokenStream getTokenStream(final Terms tpv) throws IOException {
if (!tpv.hasOffsets()) {
throw new IllegalArgumentException("Cannot create TokenStream from Terms without offsets");
throw new IllegalArgumentException("Highlighting requires offsets from the TokenStream.");
//TokenStreamFromTermPositionVector can handle a lack of offsets if there are positions. But
// highlighters require offsets, so we insist here.
}
if (!tokenPositionsGuaranteedContiguous && tpv.hasPositions()) {
return new TokenStreamFromTermPositionVector(tpv);
}
// an object used to iterate across an array of tokens
final class StoredTokenStream extends TokenStream {
Token tokens[];
int currentToken = 0;
CharTermAttribute termAtt;
OffsetAttribute offsetAtt;
PositionIncrementAttribute posincAtt;
PayloadAttribute payloadAtt;
StoredTokenStream(Token tokens[]) {
this.tokens = tokens;
termAtt = addAttribute(CharTermAttribute.class);
offsetAtt = addAttribute(OffsetAttribute.class);
posincAtt = addAttribute(PositionIncrementAttribute.class);
payloadAtt = addAttribute(PayloadAttribute.class);
}
@Override
public boolean incrementToken() {
if (currentToken >= tokens.length) {
return false;
}
Token token = tokens[currentToken++];
clearAttributes();
termAtt.setEmpty().append(token);
offsetAtt.setOffset(token.startOffset(), token.endOffset());
BytesRef payload = token.getPayload();
if (payload != null) {
payloadAtt.setPayload(payload);
}
posincAtt
.setPositionIncrement(currentToken <= 1
|| tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
.startOffset() ? 1 : 0);
return true;
}
}
boolean hasPayloads = tpv.hasPayloads();
// code to reconstruct the original sequence of Tokens
TermsEnum termsEnum = tpv.iterator(null);
int totalTokens = 0;
while(termsEnum.next() != null) {
totalTokens += (int) termsEnum.totalTermFreq();
}
Token tokensInOriginalOrder[] = new Token[totalTokens];
ArrayList<Token> unsortedTokens = null;
termsEnum = tpv.iterator(null);
BytesRef text;
DocsAndPositionsEnum dpEnum = null;
while ((text = termsEnum.next()) != null) {
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
if (dpEnum == null) {
throw new IllegalArgumentException(
"Required TermVector Offset information was not found");
}
final String term = text.utf8ToString();
dpEnum.nextDoc();
final int freq = dpEnum.freq();
for(int posUpto=0;posUpto<freq;posUpto++) {
final int pos = dpEnum.nextPosition();
if (dpEnum.startOffset() < 0) {
throw new IllegalArgumentException(
"Required TermVector Offset information was not found");
}
final Token token = new Token(term,
dpEnum.startOffset(),
dpEnum.endOffset());
if (hasPayloads) {
// Must make a deep copy of the returned payload,
// since D&PEnum API is allowed to re-use on every
// call:
token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
}
if (tokenPositionsGuaranteedContiguous && pos != -1) {
// We have positions stored and a guarantee that the token position
// information is contiguous
// This may be fast BUT wont work if Tokenizers used which create >1
// token in same position or
// creates jumps in position numbers - this code would fail under those
// circumstances
// tokens stored with positions - can use this to index straight into
// sorted array
tokensInOriginalOrder[pos] = token;
} else {
// tokens NOT stored with positions or not guaranteed contiguous - must
// add to list and sort later
if (unsortedTokens == null) {
unsortedTokens = new ArrayList<>();
}
unsortedTokens.add(token);
}
}
}
// If the field has been stored without position data we must perform a sort
if (unsortedTokens != null) {
tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens
.size()]);
ArrayUtil.timSort(tokensInOriginalOrder, new Comparator<Token>() {
@Override
public int compare(Token t1, Token t2) {
if (t1.startOffset() == t2.startOffset()) {
return t1.endOffset() - t2.endOffset();
} else {
return t1.startOffset() - t2.startOffset();
}
}
});
}
return new StoredTokenStream(tokensInOriginalOrder);
return new TokenStreamFromTermPositionVector(tpv);
}
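Editor's note: for orientation, a hedged end-to-end sketch of how this method is typically fed to the highlighter, mirroring the tests further down (indexReader, query, FIELD, and TEXT are assumed names taken from the test code):

    Terms vector = indexReader.getTermVector(0, FIELD);       // must have offsets; positions optional
    TokenStream ts = TokenSources.getTokenStream(vector);
    Highlighter highlighter = new Highlighter(new QueryScorer(query));
    String fragment = highlighter.getBestFragment(ts, TEXT);  // TEXT is the original field content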
/**
* Returns a {@link TokenStream} with positions and offsets constructed from
* field termvectors. If the field has no termvectors, or positions or offsets
* are not included in the termvector, return null.
* field termvectors. If the field has no termvectors or offsets
* are not included in the termvector, return null. See {@link #getTokenStream(org.apache.lucene.index.Terms)}
* for an explanation of what happens when positions aren't present.
*
* @param reader the {@link IndexReader} to retrieve term vectors from
* @param docId the document to retrieve termvectors for
* @param field the field to retrieve termvectors for
* @return a {@link TokenStream}, or null if positions and offsets are not available
* @return a {@link TokenStream}, or null if offsets are not available
* @throws IOException If there is a low-level I/O error
*
* @see #getTokenStream(org.apache.lucene.index.Terms)
*/
public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId,
String field) throws IOException {
@@ -305,7 +157,7 @@ public class TokenSources {
return null;
}
if (!vector.hasPositions() || !vector.hasOffsets()) {
if (!vector.hasOffsets()) {
return null;
}
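Editor's note: a hedged sketch of the null-fallback contract implied here, re-analyzing the stored text when no offset-bearing term vector exists (reader, docId, analyzer, and the "body" field name are assumptions):

    TokenStream ts = TokenSources.getTokenStreamWithOffsets(reader, docId, "body");
    if (ts == null) {
      // no term vector with offsets for this field; fall back to re-analyzing the stored value
      String storedText = reader.document(docId).get("body");
      ts = analyzer.tokenStream("body", storedText);
    }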

TokenStreamFromTermPositionVector.java

@@ -17,12 +17,7 @@ package org.apache.lucene.search.highlight;
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -32,106 +27,257 @@ import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.UnicodeUtil;
/**
* TokenStream created from a term vector field.
* TokenStream created from a term vector field. The term vector requires positions and/or offsets (either). If you
* want payloads add PayloadAttributeImpl (as you would normally) but don't assume the attribute is already added just
* because you know the term vector has payloads. This TokenStream supports an efficient {@link #reset()}, so there's
* no need to wrap with a caching impl.
* <p />
* The implementation will create an array of tokens indexed by token position. As long as there aren't massive jumps
* in positions, this is fine. And it assumes there aren't large numbers of tokens at the same position, since it adds
* them to a linked-list per position in O(N^2) complexity. When there aren't positions in the term vector, it divides
* the startOffset by 8 to use as a temporary substitute. In that case, tokens with the same startOffset will occupy
* the same final position; otherwise tokens become adjacent.
*
* @lucene.internal
*/
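Editor's note: to make the startOffset-derived fallback concrete, a small hypothetical example: with no positions stored, tokens starting at offsets 0, 2, 2, and 40 get provisional positions 0, 0, 0, and 5 (startOffset divided by 8); after the final linking pass the two tokens at offset 2 share a position (increment 0), the others come out as adjacent positions (increment 1), and the gap implied by offset 40 collapses.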
//TODO rename to TokenStreamFromTermVector
public final class TokenStreamFromTermPositionVector extends TokenStream {
private final List<Token> positionedTokens = new ArrayList<>();
//TODO add a maxStartOffset filter, which highlighters will find handy
private Iterator<Token> tokensAtCurrentPosition;
private final Terms vector;
private CharTermAttribute termAttribute;
private final CharTermAttribute termAttribute;
private PositionIncrementAttribute positionIncrementAttribute;
private final PositionIncrementAttribute positionIncrementAttribute;
private OffsetAttribute offsetAttribute;
private OffsetAttribute offsetAttribute;//maybe null
private PayloadAttribute payloadAttribute;
private PayloadAttribute payloadAttribute;//maybe null
private TokenLL firstToken = null; // the head of a linked-list
private TokenLL incrementToken = null;
/**
* Constructor.
*
* @param vector Terms that contains the data for
* creating the TokenStream. Must have positions and offsets.
* creating the TokenStream. Must have positions and/or offsets.
*/
public TokenStreamFromTermPositionVector(
final Terms vector) throws IOException {
public TokenStreamFromTermPositionVector(Terms vector) throws IOException {
if (!vector.hasPositions() && !vector.hasOffsets()) {
throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
}
assert vector.hasFreqs();
this.vector = vector;
termAttribute = addAttribute(CharTermAttribute.class);
positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
offsetAttribute = addAttribute(OffsetAttribute.class);
payloadAttribute = addAttribute(PayloadAttribute.class);
final boolean hasOffsets = vector.hasOffsets();
final boolean hasPayloads = vector.hasPayloads();
}
public Terms getTermVectorTerms() { return vector; }
@Override
public void reset() throws IOException {
if (firstToken == null) {//just the first time
init();
}
incrementToken = null;
super.reset();
}
//We initialize in reset() because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
if (vector.hasOffsets()) {
offsetAttribute = addAttribute(OffsetAttribute.class);
}
if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
payloadAttribute = getAttribute(PayloadAttribute.class);
}
// Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
TokenLL[] positionedTokens = initTokensArray();
int lastPosition = -1;
final TermsEnum termsEnum = vector.iterator(null);
BytesRef text;
BytesRef termBytesRef;
DocsAndPositionsEnum dpEnum = null;
while((text = termsEnum.next()) != null) {
//int sumFreq = 0;
while ((termBytesRef = termsEnum.next()) != null) {
//Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
// note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
final char[] termChars = new char[termBytesRef.length];
final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, termChars);
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
dpEnum.nextDoc();
final int freq = dpEnum.freq();
//sumFreq += freq;
for (int j = 0; j < freq; j++) {
int pos = dpEnum.nextPosition();
Token token;
if (hasOffsets) {
token = new Token(text.utf8ToString(),
dpEnum.startOffset(),
dpEnum.endOffset());
} else {
token = new Token();
token.setEmpty().append(text.utf8ToString());
TokenLL token = new TokenLL();
token.termChars = termChars;
token.termCharsLen = termCharsLen;
if (offsetAttribute != null) {
token.startOffset = dpEnum.startOffset();
token.endOffset = dpEnum.endOffset();
if (pos == -1) {
pos = token.startOffset >> 3;//divide by 8
}
}
if (hasPayloads) {
if (payloadAttribute != null) {
// Must make a deep copy of the returned payload,
// since D&PEnum API is allowed to re-use on every
// call:
token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
final BytesRef payload = dpEnum.getPayload();
if (payload != null) {
token.payload = BytesRef.deepCopyOf(payload);//TODO share a ByteBlockPool & re-use BytesRef
}
}
// Yes - this is the position, not the increment! This is for
// sorting. This value
// will be corrected before use.
token.setPositionIncrement(pos);
this.positionedTokens.add(token);
//Add token to an array indexed by position
if (positionedTokens.length <= pos) {
//grow, but not 2x since we think our original length estimate is close
TokenLL[] newPositionedTokens = new TokenLL[(int)((pos + 1) * 1.5f)];
System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
positionedTokens = newPositionedTokens;
}
positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]);
lastPosition = Math.max(lastPosition, pos);
}
}
CollectionUtil.timSort(this.positionedTokens, tokenComparator);
int lastPosition = -1;
for (final Token token : this.positionedTokens) {
int thisPosition = token.getPositionIncrement();
token.setPositionIncrement(thisPosition - lastPosition);
lastPosition = thisPosition;
// System.out.println(String.format(
// "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
// sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
// (originalPositionEstimate/(lastPosition + 1.0f))));
// Step 2: Link all Tokens into a linked-list and set position increments as we go
int prevTokenPos = -1;
TokenLL prevToken = null;
for (int pos = 0; pos <= lastPosition; pos++) {
TokenLL token = positionedTokens[pos];
if (token == null) {
continue;
}
//link
if (prevToken != null) {
assert prevToken.next == null;
prevToken.next = token; //concatenate linked-list
} else {
assert firstToken == null;
firstToken = token;
}
//set increments
if (vector.hasPositions()) {
token.positionIncrement = pos - prevTokenPos;
while (token.next != null) {
token = token.next;
token.positionIncrement = 0;
}
} else {
token.positionIncrement = 1;
while (token.next != null) {
prevToken = token;
token = token.next;
if (prevToken.startOffset == token.startOffset) {
token.positionIncrement = 0;
} else {
token.positionIncrement = 1;
}
}
}
prevTokenPos = pos;
prevToken = token;
}
this.tokensAtCurrentPosition = this.positionedTokens.iterator();
}
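Editor's note: because init() runs lazily inside reset() and only wires up the payload attribute if the consumer has already registered it, a consumer wanting payloads should add the attribute first. A minimal sketch (variable names assumed), matching what the test below does:

    TokenStream ts = TokenSources.getTokenStream(vectorWithPayloads);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); // must precede reset()
    ts.reset();
    while (ts.incrementToken()) {
      BytesRef payload = payloadAtt.getPayload(); // may be null for tokens without payloads
    }
    ts.end();
    ts.close();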
private static final Comparator<Token> tokenComparator = new Comparator<Token>() {
@Override
public int compare(final Token o1, final Token o2) {
return o1.getPositionIncrement() - o2.getPositionIncrement();
private TokenLL[] initTokensArray() throws IOException {
// Estimate the number of position slots we need. We use some estimation factors taken from Wikipedia
// that reduce the likelihood of needing to expand the array.
int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
int size = (int) vector.size();
if (size == -1) {//doesn't happen with term vectors, it seems, but pick a default any way
size = 128;
}
sumTotalTermFreq = (int)(size * 2.4);
}
};
final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
return new TokenLL[originalPositionEstimate];
}
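Editor's note, worked through with hypothetical numbers: if sumTotalTermFreq is unavailable and the vector reports 100 distinct terms, the token count is estimated as 100 * 2.4 = 240 and the initial array gets (int)(240 * 1.5) = 360 position slots; init() only grows the array (by roughly 1.5x) when a position lands beyond that estimate.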
@Override
public boolean incrementToken() {
if (this.tokensAtCurrentPosition.hasNext()) {
final Token next = this.tokensAtCurrentPosition.next();
clearAttributes();
termAttribute.setEmpty().append(next);
positionIncrementAttribute.setPositionIncrement(next
.getPositionIncrement());
offsetAttribute.setOffset(next.startOffset(), next.endOffset());
payloadAttribute.setPayload(next.getPayload());
return true;
if (incrementToken == null) {
incrementToken = firstToken;
if (incrementToken == null) {
return false;
}
} else if (incrementToken.next != null) {
incrementToken = incrementToken.next;
} else {
return false;
}
return false;
clearAttributes();
termAttribute.copyBuffer(incrementToken.termChars, 0, incrementToken.termCharsLen);
positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement);
if (offsetAttribute != null) {
offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.endOffset);
}
if (payloadAttribute != null) {
payloadAttribute.setPayload(incrementToken.payload);
}
return true;
}
@Override
public void reset() {
this.tokensAtCurrentPosition = this.positionedTokens.iterator();
private static class TokenLL {
char[] termChars;
int termCharsLen;
int positionIncrement;
int startOffset;
int endOffset;
BytesRef payload;
TokenLL next;
/** Given the head of a linked-list (possibly null) this inserts the token at the correct
* spot to maintain the desired order, and returns the head (which could be this token if it's the smallest).
* O(N^2) complexity but N should be a handful at most.
*/
TokenLL insertIntoSortedLinkedList(final TokenLL head) {
assert next == null;
if (head == null) {
return this;
} else if (this.compareOffsets(head) <= 0) {
this.next = head;
return this;
}
TokenLL prev = head;
while (prev.next != null && this.compareOffsets(prev.next) > 0) {
prev = prev.next;
}
this.next = prev.next;
prev.next = this;
return head;
}
/** by startOffset then endOffset */
int compareOffsets(TokenLL tokenB) {
int cmp = Integer.compare(this.startOffset, tokenB.startOffset);
if (cmp == 0) {
cmp = Integer.compare(this.endOffset, tokenB.endOffset);
}
return cmp;
}
}
}
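Editor's note: a short reuse sketch of the efficient reset() the class javadoc advertises (names assumed); the token linked-list is built on the first reset() and later resets merely rewind:

    TokenStream ts = TokenSources.getTokenStream(reader.getTermVector(docId, "body"));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                       // builds the token list from the term vector
    while (ts.incrementToken()) { /* first pass */ }
    ts.reset();                       // cheap: no re-decoding of the term vector
    while (ts.incrementToken()) { /* second pass, e.g. for another highlighter */ }
    ts.end();
    ts.close();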

TokenSourcesTest.java

@@ -19,6 +19,8 @@ package org.apache.lucene.search.highlight;
import java.io.IOException;
import com.carrotsearch.randomizedtesting.annotations.Repeat;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
@@ -30,6 +32,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
@@ -44,10 +47,10 @@ import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
// LUCENE-2874
public class TokenSourcesTest extends LuceneTestCase {
public class TokenSourcesTest extends BaseTokenStreamTestCase {
private static final String FIELD = "text";
private static final class OverlappingTokenStream extends TokenStream {
@@ -121,8 +124,7 @@ public class TokenSourcesTest extends LuceneTestCase {
new QueryScorer(query));
final TokenStream tokenStream = TokenSources
.getTokenStream(
indexReader.getTermVector(0, FIELD),
false);
indexReader.getTermVector(0, FIELD));
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
@@ -166,8 +168,7 @@ public class TokenSourcesTest extends LuceneTestCase {
new QueryScorer(query));
final TokenStream tokenStream = TokenSources
.getTokenStream(
indexReader.getTermVector(0, FIELD),
false);
indexReader.getTermVector(0, FIELD));
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
@@ -210,8 +211,7 @@ public class TokenSourcesTest extends LuceneTestCase {
new QueryScorer(phraseQuery));
final TokenStream tokenStream = TokenSources
.getTokenStream(
indexReader.getTermVector(0, FIELD),
false);
indexReader.getTermVector(0, FIELD));
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
@@ -254,8 +254,7 @@ public class TokenSourcesTest extends LuceneTestCase {
new QueryScorer(phraseQuery));
final TokenStream tokenStream = TokenSources
.getTokenStream(
indexReader.getTermVector(0, FIELD),
false);
indexReader.getTermVector(0, FIELD));
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
@@ -284,8 +283,7 @@ public class TokenSourcesTest extends LuceneTestCase {
try {
assertEquals(1, indexReader.numDocs());
TokenSources.getTokenStream(
indexReader.getTermVector(0, FIELD),
false);
indexReader.getTermVector(0, FIELD));
fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
}
catch (IllegalArgumentException e) {
@@ -335,27 +333,98 @@ public class TokenSourcesTest extends LuceneTestCase {
writer.close();
assertEquals(1, reader.numDocs());
for(int i=0;i<2;i++) {
// Do this twice, once passing true and then passing
// false: they are entirely different code paths
// under-the-hood:
TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), i == 0);
TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
for(Token token : tokens) {
assertTrue(ts.incrementToken());
assertEquals(token.toString(), termAtt.toString());
assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
assertEquals(token.getPayload(), payloadAtt.getPayload());
assertEquals(token.startOffset(), offsetAtt.startOffset());
assertEquals(token.endOffset(), offsetAtt.endOffset());
ts.reset();
for(Token token : tokens) {
assertTrue(ts.incrementToken());
assertEquals(token.toString(), termAtt.toString());
assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
assertEquals(token.getPayload(), payloadAtt.getPayload());
assertEquals(token.startOffset(), offsetAtt.startOffset());
assertEquals(token.endOffset(), offsetAtt.endOffset());
}
assertFalse(ts.incrementToken());
reader.close();
dir.close();
}
@Repeat(iterations = 10)
//@Seed("947083AB20AB2D4F")
public void testRandomizedRoundTrip() throws Exception {
final int distinct = TestUtil.nextInt(random(), 1, 10);
String[] terms = new String[distinct];
BytesRef[] termBytes = new BytesRef[distinct];
for (int i = 0; i < distinct; ++i) {
terms[i] = TestUtil.randomRealisticUnicodeString(random());
termBytes[i] = new BytesRef(terms[i]);
}
final BaseTermVectorsFormatTestCase.RandomTokenStream rTokenStream =
new BaseTermVectorsFormatTestCase.RandomTokenStream(TestUtil.nextInt(random(), 1, 10), terms, termBytes, false);
//check to see if the token streams might have non-deterministic testable result
final boolean storeTermVectorPositions = random().nextBoolean();
final int[] startOffsets = rTokenStream.getStartOffsets();
final int[] positionsIncrements = rTokenStream.getPositionsIncrements();
for (int i = 1; i < positionsIncrements.length; i++) {
if (storeTermVectorPositions && positionsIncrements[i] != 0) {
continue;
}
//TODO should RandomTokenStream ensure endOffsets for tokens at same position and same startOffset are greater
// than previous token's endOffset? That would increase the testable possibilities.
if (startOffsets[i] == startOffsets[i-1]) {
if (VERBOSE)
System.out.println("Skipping test because can't easily validate random token-stream is correct.");
return;
}
}
assertFalse(ts.incrementToken());
//sanity check itself
assertTokenStreamContents(rTokenStream,
rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(),
rTokenStream.getPositionsIncrements());
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
myFieldType.setStoreTermVectors(true);
myFieldType.setStoreTermVectorOffsets(true);
myFieldType.setStoreTermVectorPositions(storeTermVectorPositions);
//payloads require positions; it will throw an error otherwise
myFieldType.setStoreTermVectorPayloads(storeTermVectorPositions && random().nextBoolean());
Document doc = new Document();
doc.add(new Field("field", rTokenStream, myFieldType));
writer.addDocument(doc);
IndexReader reader = writer.getReader();
writer.close();
assertEquals(1, reader.numDocs());
TokenStream vectorTokenStream = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));
//sometimes check payloads
PayloadAttribute payloadAttribute = null;
if (myFieldType.storeTermVectorPayloads() && usually()) {
payloadAttribute = vectorTokenStream.addAttribute(PayloadAttribute.class);
}
assertTokenStreamContents(vectorTokenStream,
rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(),
myFieldType.storeTermVectorPositions() ? rTokenStream.getPositionsIncrements() : null);
//test payloads
if (payloadAttribute != null) {
vectorTokenStream.reset();
for (int i = 0; vectorTokenStream.incrementToken(); i++) {
assertEquals(rTokenStream.getPayloads()[i], payloadAttribute.getPayload());
}
}
reader.close();

BaseTermVectorsFormatTestCase.java

@@ -95,17 +95,6 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
return ft;
}
protected BytesRef randomPayload() {
final int len = random().nextInt(5);
if (len == 0) {
return null;
}
final BytesRef payload = new BytesRef(len);
random().nextBytes(payload.bytes);
payload.length = len;
return payload;
}
@Override
protected void addRandomFields(Document doc) {
for (Options opts : validOptions()) {
@@ -172,7 +161,9 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
}
// TODO: use CannedTokenStream?
protected class RandomTokenStream extends TokenStream {
// TODO: pull out and make top-level-utility, separate from TermVectors
/** Produces a random TokenStream based off of provided terms. */
public static class RandomTokenStream extends TokenStream {
final String[] terms;
final BytesRef[] termBytes;
@@ -191,11 +182,11 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
final PayloadAttribute pAtt;
int i = 0;
protected RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes) {
public RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes) {
this(len, sampleTerms, sampleTermBytes, rarely());
}
protected RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes, boolean offsetsGoBackwards) {
public RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes, boolean offsetsGoBackwards) {
terms = new String[len];
termBytes = new BytesRef[len];
positionsIncrements = new int[len];
@@ -266,6 +257,17 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
pAtt = addAttribute(PayloadAttribute.class);
}
protected BytesRef randomPayload() {
final int len = random().nextInt(5);
if (len == 0) {
return null;
}
final BytesRef payload = new BytesRef(len);
random().nextBytes(payload.bytes);
payload.length = len;
return payload;
}
public boolean hasPayloads() {
for (BytesRef payload : payloads) {
if (payload != null && payload.length > 0) {
@@ -275,9 +277,40 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
return false;
}
public String[] getTerms() {
return terms;
}
public BytesRef[] getTermBytes() {
return termBytes;
}
public int[] getPositionsIncrements() {
return positionsIncrements;
}
public int[] getStartOffsets() {
return startOffsets;
}
public int[] getEndOffsets() {
return endOffsets;
}
public BytesRef[] getPayloads() {
return payloads;
}
@Override
public void reset() throws IOException {
i = 0;
super.reset();
}
@Override
public final boolean incrementToken() throws IOException {
if (i < terms.length) {
clearAttributes();
termAtt.setLength(0).append(terms[i]);
piAtt.setPositionIncrement(positionsIncrements[i]);
oAtt.setOffset(startOffsets[i], endOffsets[i]);