diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 261f5ad3b44..870cf2a8255 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -347,6 +347,11 @@ Optimizations
 * LUCENE-6033: CachingTokenFilter now uses ArrayList not LinkedList, and has a
   new isCached() method. (David Smiley)
 
+* LUCENE-6031: TokenSources (in the default highlighter) converts term vectors into a
+  TokenStream much faster: in linear time (not N*log(N)), using less memory, and with
+  reset() implemented. Only one of offsets or positions is required of the term vector.
+  (David Smiley)
+
 Build
 
 * LUCENE-5909: Smoke tester now has better command line parsing and
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
index 68433a9425f..42f568b8f40 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
@@ -21,24 +21,13 @@ package org.apache.lucene.search.highlight;
  */
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Comparator;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.StoredDocument;
 import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.BytesRef;
 
 /**
  * Hides implementation issues associated with obtaining a TokenStream for use
@@ -113,184 +102,47 @@ public class TokenSources {
     return ts;
   }
 
-  public static TokenStream getTokenStream(Terms vector) throws IOException {
-    // assumes the worst and makes no assumptions about token position
-    // sequences.
-    return getTokenStream(vector, false);
+  /** Simply calls {@link #getTokenStream(org.apache.lucene.index.Terms)} now. */
+  @Deprecated
+  public static TokenStream getTokenStream(Terms vector,
+                                           boolean tokenPositionsGuaranteedContiguous) throws IOException {
+    return getTokenStream(vector);
   }
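Note for callers migrating off the deprecated two-argument overload: the boolean no longer selects a different code path, so it can simply be dropped. A minimal sketch (the reader and field names are illustrative):

    // Before (deprecated; the flag is now ignored):
    //   TokenStream ts = TokenSources.getTokenStream(reader.getTermVector(docId, "body"), false);
    // After:
    TokenStream ts = TokenSources.getTokenStream(reader.getTermVector(docId, "body"));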
 
   /**
-   * Low level api. Returns a token stream generated from a {@link Terms}. This
+   * Returns a token stream generated from a {@link Terms}. This
    * can be used to feed the highlighter with a pre-parsed token
-   * stream.  The {@link Terms} must have offsets available.
-   * 
-   * In my tests the speeds to recreate 1000 token streams using this method
-   * are: - with TermVector offset only data stored - 420 milliseconds - with
-   * TermVector offset AND position data stored - 271 milliseconds (nb timings
-   * for TermVector with position data are based on a tokenizer with contiguous
-   * positions - no overlaps or gaps) The cost of not using TermPositionVector
-   * to store pre-parsed content and using an analyzer to re-parse the original
-   * content: - reanalyzing the original content - 980 milliseconds
-   * 
-   * The re-analyze timings will typically vary depending on - 1) The complexity
-   * of the analyzer code (timings above were using a
-   * stemmer/lowercaser/stopword combo) 2) The number of other fields (Lucene
-   * reads ALL fields off the disk when accessing just one document field - can
-   * cost dear!) 3) Use of compression on field storage - could be faster due to
-   * compression (less disk IO) or slower (more CPU burn) depending on the
-   * content.
-   * 
-   * @param tokenPositionsGuaranteedContiguous true if the token position
-   *        numbers have no overlaps or gaps. If looking to eek out the last
-   *        drops of performance, set to true. If in doubt, set to false.
+   * stream. The {@link Terms} must have offsets available. If positions are not available, position
+   * increments are derived from the offsets: tokens become adjacent, or coincident when they share a
+   * start offset. If stopwords were filtered from the index, you probably want to ensure the term
+   * vector has positions so that phrase queries won't match across stopwords.
    *
    * @throws IllegalArgumentException if no offsets are available
    */
-  public static TokenStream getTokenStream(Terms tpv,
-      boolean tokenPositionsGuaranteedContiguous)
-      throws IOException {
+  public static TokenStream getTokenStream(final Terms tpv) throws IOException {
 
     if (!tpv.hasOffsets()) {
-      throw new IllegalArgumentException("Cannot create TokenStream from Terms without offsets");
+      //TokenStreamFromTermPositionVector can handle a lack of offsets if there are positions. But
+      // highlighters require offsets, so we insist here.
+      throw new IllegalArgumentException("Highlighting requires offsets from the TokenStream.");
     }
 
-    if (!tokenPositionsGuaranteedContiguous && tpv.hasPositions()) {
-      return new TokenStreamFromTermPositionVector(tpv);
-    }
-
-    // an object used to iterate across an array of tokens
-    final class StoredTokenStream extends TokenStream {
-      Token tokens[];
-
-      int currentToken = 0;
-
-      CharTermAttribute termAtt;
-
-      OffsetAttribute offsetAtt;
-
-      PositionIncrementAttribute posincAtt;
-
-      PayloadAttribute payloadAtt;
-
-      StoredTokenStream(Token tokens[]) {
-        this.tokens = tokens;
-        termAtt = addAttribute(CharTermAttribute.class);
-        offsetAtt = addAttribute(OffsetAttribute.class);
-        posincAtt = addAttribute(PositionIncrementAttribute.class);
-        payloadAtt = addAttribute(PayloadAttribute.class);
-      }
-
-      @Override
-      public boolean incrementToken() {
-        if (currentToken >= tokens.length) {
-          return false;
-        }
-        Token token = tokens[currentToken++];
-        clearAttributes();
-        termAtt.setEmpty().append(token);
-        offsetAtt.setOffset(token.startOffset(), token.endOffset());
-        BytesRef payload = token.getPayload();
-        if (payload != null) {
-          payloadAtt.setPayload(payload);
-        }
-        posincAtt
-            .setPositionIncrement(currentToken <= 1
-                || tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
-                    .startOffset() ? 1 : 0);
-        return true;
-      }
-    }
-
-    boolean hasPayloads = tpv.hasPayloads();
-
-    // code to reconstruct the original sequence of Tokens
-    TermsEnum termsEnum = tpv.iterator(null);
-    int totalTokens = 0;
-    while(termsEnum.next() != null) {
-      totalTokens += (int) termsEnum.totalTermFreq();
-    }
-    Token tokensInOriginalOrder[] = new Token[totalTokens];
-    ArrayList<Token> unsortedTokens = null;
-    termsEnum = tpv.iterator(null);
-    BytesRef text;
-    DocsAndPositionsEnum dpEnum = null;
-    while ((text = termsEnum.next()) != null) {
-
-      dpEnum = termsEnum.docsAndPositions(null, dpEnum);
-      if (dpEnum == null) {
-        throw new IllegalArgumentException(
-            "Required TermVector Offset information was not found");
-      }
-      final String term = text.utf8ToString();
-
-      dpEnum.nextDoc();
-      final int freq = dpEnum.freq();
-      for(int posUpto=0;posUpto<freq;posUpto++) {
-        final int pos = dpEnum.nextPosition();
-        final Token token = new Token(term,
-                                      dpEnum.startOffset(),
-                                      dpEnum.endOffset());
-        if (hasPayloads) {
-          token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
-        }
-        if (tokenPositionsGuaranteedContiguous && pos != -1) {
-          // We have positions stored and a guarantee that the token position
-          // information is contiguous
-
-          // This may be fast BUT wont work if tokenizer creates >1
-          // token in same position or
-          // creates jumps in position numbers - this code would fail under those
-          // circumstances
-
-          // tokens stored with positions - can use this to index straight into
-          // sorted array
-          tokensInOriginalOrder[pos] = token;
-        } else {
-          // tokens NOT stored with positions or not guaranteed contiguous - must
-          // add to list and sort later
-          if (unsortedTokens == null) {
-            unsortedTokens = new ArrayList<>();
-          }
-          unsortedTokens.add(token);
-        }
-      }
-    }
-
-    // If the field has been stored without position data we must perform a sort
-    if (unsortedTokens != null) {
-      tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens
-          .size()]);
-      ArrayUtil.timSort(tokensInOriginalOrder, new Comparator<Token>() {
-        @Override
-        public int compare(Token t1, Token t2) {
-          if (t1.startOffset() == t2.startOffset()) {
-            return t1.endOffset() - t2.endOffset();
-          } else {
-            return t1.startOffset() - t2.startOffset();
-          }
-        }
-      });
-    }
-    return new StoredTokenStream(tokensInOriginalOrder);
+    return new TokenStreamFromTermPositionVector(tpv);
   }
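A sketch of the typical call site under the new contract, assuming the field was indexed with term vectors that include offsets (the query, reader, and field names are illustrative):

    Terms vector = indexReader.getTermVector(docId, "body");
    TokenStream tokenStream = TokenSources.getTokenStream(vector); // IllegalArgumentException if no offsets
    Highlighter highlighter = new Highlighter(new QueryScorer(query));
    String fragment = highlighter.getBestFragment(tokenStream, storedBodyText);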
 
   /**
    * Returns a {@link TokenStream} with positions and offsets constructed from
-   * field termvectors. If the field has no termvectors, or positions or offsets
-   * are not included in the termvector, return null.
+   * field termvectors. If the field has no termvectors, or offsets
+   * are not included in the termvector, return null. See {@link #getTokenStream(org.apache.lucene.index.Terms)}
+   * for an explanation of what happens when positions aren't present.
+   *
    * @param reader the {@link IndexReader} to retrieve term vectors from
    * @param docId the document to retrieve termvectors for
    * @param field the field to retrieve termvectors for
-   * @return a {@link TokenStream}, or null if positions and offsets are not available
+   * @return a {@link TokenStream}, or null if offsets are not available
    * @throws IOException If there is a low-level I/O error
+   *
+   * @see #getTokenStream(org.apache.lucene.index.Terms)
    */
   public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId,
                                                       String field) throws IOException {
@@ -305,7 +157,7 @@ public class TokenSources {
       return null;
     }
 
-    if (!vector.hasPositions() || !vector.hasOffsets()) {
+    if (!vector.hasOffsets()) {
       return null;
     }
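Unlike getTokenStream(Terms), this convenience method returns null rather than throwing, so callers can fall back to re-analysis. A hedged sketch of such a fallback (the analyzer and stored-field access are illustrative, not part of this patch):

    TokenStream ts = TokenSources.getTokenStreamWithOffsets(reader, docId, "body");
    if (ts == null) {
      // No term vector with offsets for this field; re-analyze the stored text instead.
      ts = analyzer.tokenStream("body", reader.document(docId).get("body"));
    }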
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
index bd87206736a..982ce4a5b02 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
@@ -17,12 +17,7 @@ package org.apache.lucene.search.highlight;
  * limitations under the License.
  */
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -32,106 +27,257 @@ import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.CollectionUtil;
+import org.apache.lucene.util.UnicodeUtil;
 
 /**
- * TokenStream created from a term vector field.
+ * TokenStream created from a term vector field. The term vector requires positions and/or offsets (either one
+ * suffices). If you want payloads, add a PayloadAttribute (as you normally would) but don't assume the attribute is
+ * already added just because you know the term vector has payloads. This TokenStream supports an efficient
+ * {@link #reset()}, so there's no need to wrap with a caching impl.
+ * <p />
+ * The implementation will create an array of tokens indexed by token position. As long as there aren't massive jumps
+ * in positions, this is fine. And it assumes there aren't large numbers of tokens at the same position, since it adds
+ * them to a linked-list per position in O(N^2) complexity. When there aren't positions in the term vector, it divides
+ * the startOffset by 8 to use as a temporary substitute. In that case, tokens with the same startOffset will occupy
+ * the same final position; otherwise tokens become adjacent.
+ *
+ * @lucene.internal
  */
+//TODO rename to TokenStreamFromTermVector
 public final class TokenStreamFromTermPositionVector extends TokenStream {
 
-  private final List<Token> positionedTokens = new ArrayList<>();
+  //TODO add a maxStartOffset filter, which highlighters will find handy
 
-  private Iterator<Token> tokensAtCurrentPosition;
+  private final Terms vector;
 
-  private CharTermAttribute termAttribute;
+  private final CharTermAttribute termAttribute;
 
-  private PositionIncrementAttribute positionIncrementAttribute;
+  private final PositionIncrementAttribute positionIncrementAttribute;
 
-  private OffsetAttribute offsetAttribute;
+  private OffsetAttribute offsetAttribute;//maybe null
 
-  private PayloadAttribute payloadAttribute;
+  private PayloadAttribute payloadAttribute;//maybe null
+
+  private TokenLL firstToken = null; // the head of a linked-list
+
+  private TokenLL incrementToken = null;
 
   /**
    * Constructor.
    *
    * @param vector Terms that contains the data for
-   *        creating the TokenStream. Must have positions and offsets.
+   *        creating the TokenStream. Must have positions and/or offsets.
    */
-  public TokenStreamFromTermPositionVector(
-      final Terms vector) throws IOException {
+  public TokenStreamFromTermPositionVector(Terms vector) throws IOException {
+    if (!vector.hasPositions() && !vector.hasOffsets()) {
+      throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
+    }
+    assert vector.hasFreqs();
+    this.vector = vector;
     termAttribute = addAttribute(CharTermAttribute.class);
     positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
-    offsetAttribute = addAttribute(OffsetAttribute.class);
-    payloadAttribute = addAttribute(PayloadAttribute.class);
-    final boolean hasOffsets = vector.hasOffsets();
-    final boolean hasPayloads = vector.hasPayloads();
+  }
+
+  public Terms getTermVectorTerms() { return vector; }
+
+  @Override
+  public void reset() throws IOException {
+    if (firstToken == null) {//just the first time
+      init();
+    }
+    incrementToken = null;
+    super.reset();
+  }
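Since reset() builds the token list once and afterwards merely rewinds to the head of the linked-list, a single instance can be consumed repeatedly without a CachingTokenFilter. A hypothetical double-pass sketch:

    TokenStream ts = new TokenStreamFromTermPositionVector(vector);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    for (int pass = 0; pass < 2; pass++) {
      ts.reset(); // first call runs init(); subsequent calls just rewind
      while (ts.incrementToken()) {
        System.out.println(pass + ": " + termAtt);
      }
      ts.end();
    }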
+
+  //We initialize in reset() because we can see which attributes the consumer wants, particularly payloads
+  private void init() throws IOException {
+    if (vector.hasOffsets()) {
+      offsetAttribute = addAttribute(OffsetAttribute.class);
+    }
+    if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
+      payloadAttribute = getAttribute(PayloadAttribute.class);
+    }
+
+    // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
+
+    TokenLL[] positionedTokens = initTokensArray();
+
+    int lastPosition = -1;
+
     final TermsEnum termsEnum = vector.iterator(null);
-    BytesRef text;
+    BytesRef termBytesRef;
     DocsAndPositionsEnum dpEnum = null;
-    while((text = termsEnum.next()) != null) {
+    //int sumFreq = 0;
+    while ((termBytesRef = termsEnum.next()) != null) {
+      //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
+      // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
+      final char[] termChars = new char[termBytesRef.length];
+      final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, termChars);
+
       dpEnum = termsEnum.docsAndPositions(null, dpEnum);
       assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
       dpEnum.nextDoc();
       final int freq = dpEnum.freq();
+      //sumFreq += freq;
       for (int j = 0; j < freq; j++) {
         int pos = dpEnum.nextPosition();
-        Token token;
-        if (hasOffsets) {
-          token = new Token(text.utf8ToString(),
-                            dpEnum.startOffset(),
-                            dpEnum.endOffset());
-        } else {
-          token = new Token();
-          token.setEmpty().append(text.utf8ToString());
+        TokenLL token = new TokenLL();
+        token.termChars = termChars;
+        token.termCharsLen = termCharsLen;
+        if (offsetAttribute != null) {
+          token.startOffset = dpEnum.startOffset();
+          token.endOffset = dpEnum.endOffset();
+          if (pos == -1) {
+            pos = token.startOffset >> 3;//divide by 8
+          }
         }
-        if (hasPayloads) {
+
+        if (payloadAttribute != null) {
           // Must make a deep copy of the returned payload,
           // since D&PEnum API is allowed to re-use on every
          // call:
-          token.setPayload(BytesRef.deepCopyOf(dpEnum.getPayload()));
+          final BytesRef payload = dpEnum.getPayload();
+          if (payload != null) {
+            token.payload = BytesRef.deepCopyOf(payload);//TODO share a ByteBlockPool & re-use BytesRef
+          }
         }
-        // Yes - this is the position, not the increment! This is for
-        // sorting. This value
-        // will be corrected before use.
-        token.setPositionIncrement(pos);
-        this.positionedTokens.add(token);
+
+        //Add token to an array indexed by position
+        if (positionedTokens.length <= pos) {
+          //grow, but not 2x since we think our original length estimate is close
+          TokenLL[] newPositionedTokens = new TokenLL[(int)((pos + 1) * 1.5f)];
+          System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
+          positionedTokens = newPositionedTokens;
+        }
+        positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]);
+
+        lastPosition = Math.max(lastPosition, pos);
       }
     }
-    CollectionUtil.timSort(this.positionedTokens, tokenComparator);
-    int lastPosition = -1;
-    for (final Token token : this.positionedTokens) {
-      int thisPosition = token.getPositionIncrement();
-      token.setPositionIncrement(thisPosition - lastPosition);
-      lastPosition = thisPosition;
+
+//    System.out.println(String.format(
+//        "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
+//        sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
+//        (originalPositionEstimate/(lastPosition + 1.0f))));
+
+    // Step 2: Link all Tokens into a linked-list and set position increments as we go
+
+    int prevTokenPos = -1;
+    TokenLL prevToken = null;
+    for (int pos = 0; pos <= lastPosition; pos++) {
+      TokenLL token = positionedTokens[pos];
+      if (token == null) {
+        continue;
+      }
+      //link
+      if (prevToken != null) {
+        assert prevToken.next == null;
+        prevToken.next = token; //concatenate linked-list
+      } else {
+        assert firstToken == null;
+        firstToken = token;
+      }
+      //set increments
+      if (vector.hasPositions()) {
+        token.positionIncrement = pos - prevTokenPos;
+        while (token.next != null) {
+          token = token.next;
+          token.positionIncrement = 0;
+        }
+      } else {
+        token.positionIncrement = 1;
+        while (token.next != null) {
+          prevToken = token;
+          token = token.next;
+          if (prevToken.startOffset == token.startOffset) {
+            token.positionIncrement = 0;
+          } else {
+            token.positionIncrement = 1;
+          }
+        }
+      }
+      prevTokenPos = pos;
+      prevToken = token;
     }
-
-    this.tokensAtCurrentPosition = this.positionedTokens.iterator();
   }
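A worked trace of Step 2 for an offsets-only vector (no positions), derived from the code above; the tokens are hypothetical:

    // Input tokens: "foo"[0,3], "foobar"[0,6], "bar"[4,7]; provisional position = startOffset >> 3,
    // so all three land in slot 0 and are sorted there by (startOffset, endOffset): foo, foobar, bar.
    // Step 2 then assigns increments: foo=1 (first token), foobar=0 (shares startOffset 0 with foo),
    // bar=1 (new startOffset). That is, "foobar" becomes coincident and "bar" adjacent, as the class
    // javadoc describes. With positions available, the head of each slot instead gets
    // (pos - prevTokenPos) and the remaining tokens in the slot get 0.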
 
-  private static final Comparator<Token> tokenComparator = new Comparator<Token>() {
-    @Override
-    public int compare(final Token o1, final Token o2) {
-      return o1.getPositionIncrement() - o2.getPositionIncrement();
+  private TokenLL[] initTokensArray() throws IOException {
+    // Estimate the number of position slots we need. We use some estimation factors taken from Wikipedia
+    // that reduce the likelihood of needing to expand the array.
+    int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
+    if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
+      int size = (int) vector.size();
+      if (size == -1) {//doesn't happen with term vectors, it seems, but pick a default any way
+        size = 128;
+      }
+      sumTotalTermFreq = (int)(size * 2.4);
     }
-  };
-
+    final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
+    return new TokenLL[originalPositionEstimate];
+  }
+
   @Override
   public boolean incrementToken() {
-    if (this.tokensAtCurrentPosition.hasNext()) {
-      final Token next = this.tokensAtCurrentPosition.next();
-      clearAttributes();
-      termAttribute.setEmpty().append(next);
-      positionIncrementAttribute.setPositionIncrement(next
-          .getPositionIncrement());
-      offsetAttribute.setOffset(next.startOffset(), next.endOffset());
-      payloadAttribute.setPayload(next.getPayload());
-      return true;
+    if (incrementToken == null) {
+      incrementToken = firstToken;
+      if (incrementToken == null) {
+        return false;
+      }
+    } else if (incrementToken.next != null) {
+      incrementToken = incrementToken.next;
+    } else {
+      return false;
     }
-    return false;
+    clearAttributes();
+    termAttribute.copyBuffer(incrementToken.termChars, 0, incrementToken.termCharsLen);
+    positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement);
+    if (offsetAttribute != null) {
+      offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.endOffset);
+    }
+    if (payloadAttribute != null) {
+      payloadAttribute.setPayload(incrementToken.payload);
+    }
+    return true;
   }
 
-  @Override
-  public void reset() {
-    this.tokensAtCurrentPosition = this.positionedTokens.iterator();
+  private static class TokenLL {
+    char[] termChars;
+    int termCharsLen;
+    int positionIncrement;
+    int startOffset;
+    int endOffset;
+    BytesRef payload;
+
+    TokenLL next;
+
+    /** Given the head of a linked-list (possibly null), this inserts the token at the correct
+     * spot to maintain the desired order, and returns the head (which could be this token if it's the smallest).
+     * O(N^2) complexity but N should be a handful at most.
+     */
+    TokenLL insertIntoSortedLinkedList(final TokenLL head) {
+      assert next == null;
+      if (head == null) {
+        return this;
+      } else if (this.compareOffsets(head) <= 0) {
+        this.next = head;
+        return this;
+      }
+      TokenLL prev = head;
+      while (prev.next != null && this.compareOffsets(prev.next) > 0) {
+        prev = prev.next;
+      }
+      this.next = prev.next;
+      prev.next = this;
+      return head;
+    }
+
+    /** by startOffset then endOffset */
+    int compareOffsets(TokenLL tokenB) {
+      int cmp = Integer.compare(this.startOffset, tokenB.startOffset);
+      if (cmp == 0) {
+        cmp = Integer.compare(this.endOffset, tokenB.endOffset);
+      }
+      return cmp;
+    }
   }
 }
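For a sense of the sizing heuristic in initTokensArray(), a worked instance for a hypothetical vector that lacks the sumTotalTermFreq stat:

    // vector.size() = 50 distinct terms, getSumTotalTermFreq() = -1:
    //   estimated total tokens = 50 * 2.4 = 120
    //   initial array length   = 120 * 1.5 = 180 position slots
    // A token at a position >= 180 triggers the 1.5x grow path in init()'s Step 1.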
+ */ + TokenLL insertIntoSortedLinkedList(final TokenLL head) { + assert next == null; + if (head == null) { + return this; + } else if (this.compareOffsets(head) <= 0) { + this.next = head; + return this; + } + TokenLL prev = head; + while (prev.next != null && this.compareOffsets(prev.next) > 0) { + prev = prev.next; + } + this.next = prev.next; + prev.next = this; + return head; + } + + /** by startOffset then endOffset */ + int compareOffsets(TokenLL tokenB) { + int cmp = Integer.compare(this.startOffset, tokenB.startOffset); + if (cmp == 0) { + cmp = Integer.compare(this.endOffset, tokenB.endOffset); + } + return cmp; + } } } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java index 6d68ea22cd1..b2312034a4e 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java @@ -19,6 +19,8 @@ package org.apache.lucene.search.highlight; import java.io.IOException; +import com.carrotsearch.randomizedtesting.annotations.Repeat; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.CannedTokenStream; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; @@ -30,6 +32,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.BaseTermVectorsFormatTestCase; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; @@ -44,10 +47,10 @@ import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; // LUCENE-2874 -public class TokenSourcesTest extends LuceneTestCase { +public class TokenSourcesTest extends BaseTokenStreamTestCase { private static final String FIELD = "text"; private static final class OverlappingTokenStream extends TokenStream { @@ -121,8 +124,7 @@ public class TokenSourcesTest extends LuceneTestCase { new QueryScorer(query)); final TokenStream tokenStream = TokenSources .getTokenStream( - indexReader.getTermVector(0, FIELD), - false); + indexReader.getTermVector(0, FIELD)); assertEquals("the fox did not jump", highlighter.getBestFragment(tokenStream, TEXT)); } finally { @@ -166,8 +168,7 @@ public class TokenSourcesTest extends LuceneTestCase { new QueryScorer(query)); final TokenStream tokenStream = TokenSources .getTokenStream( - indexReader.getTermVector(0, FIELD), - false); + indexReader.getTermVector(0, FIELD)); assertEquals("the fox did not jump", highlighter.getBestFragment(tokenStream, TEXT)); } finally { @@ -210,8 +211,7 @@ public class TokenSourcesTest extends LuceneTestCase { new QueryScorer(phraseQuery)); final TokenStream tokenStream = TokenSources .getTokenStream( - indexReader.getTermVector(0, FIELD), - false); + indexReader.getTermVector(0, FIELD)); assertEquals("the fox did not jump", highlighter.getBestFragment(tokenStream, TEXT)); } finally { @@ -254,8 +254,7 @@ public class TokenSourcesTest extends LuceneTestCase { new QueryScorer(phraseQuery)); final TokenStream tokenStream = TokenSources .getTokenStream( 
-              indexReader.getTermVector(0, FIELD),
-              false);
+              indexReader.getTermVector(0, FIELD));
       assertEquals("the fox did not jump",
           highlighter.getBestFragment(tokenStream, TEXT));
     } finally {
@@ -284,8 +283,7 @@ public class TokenSourcesTest extends LuceneTestCase {
     try {
       assertEquals(1, indexReader.numDocs());
       TokenSources.getTokenStream(
-          indexReader.getTermVector(0, FIELD),
-          false);
+          indexReader.getTermVector(0, FIELD));
       fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
     } catch (IllegalArgumentException e) {
@@ -335,27 +333,98 @@ public class TokenSourcesTest extends LuceneTestCase {
     writer.close();
     assertEquals(1, reader.numDocs());
 
-    for(int i=0;i<2;i++) {
-      // Do this twice, once passing true and then passing
-      // false: they are entirely different code paths
-      // under-the-hood:
-      TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), i == 0);
+    TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));
 
-      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
-      PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
-      OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
-      PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
+    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+    PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
+    OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
+    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
 
-      for(Token token : tokens) {
-        assertTrue(ts.incrementToken());
-        assertEquals(token.toString(), termAtt.toString());
-        assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
-        assertEquals(token.getPayload(), payloadAtt.getPayload());
-        assertEquals(token.startOffset(), offsetAtt.startOffset());
-        assertEquals(token.endOffset(), offsetAtt.endOffset());
-      }
-
-      assertFalse(ts.incrementToken());
-    }
+    ts.reset();
+    for(Token token : tokens) {
+      assertTrue(ts.incrementToken());
+      assertEquals(token.toString(), termAtt.toString());
+      assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
+      assertEquals(token.getPayload(), payloadAtt.getPayload());
+      assertEquals(token.startOffset(), offsetAtt.startOffset());
+      assertEquals(token.endOffset(), offsetAtt.endOffset());
+    }
+
+    assertFalse(ts.incrementToken());
+
+    reader.close();
+    dir.close();
+  }
+
+  @Repeat(iterations = 10)
+  //@Seed("947083AB20AB2D4F")
+  public void testRandomizedRoundTrip() throws Exception {
+    final int distinct = TestUtil.nextInt(random(), 1, 10);
+
+    String[] terms = new String[distinct];
+    BytesRef[] termBytes = new BytesRef[distinct];
+    for (int i = 0; i < distinct; ++i) {
+      terms[i] = TestUtil.randomRealisticUnicodeString(random());
+      termBytes[i] = new BytesRef(terms[i]);
+    }
+
+    final BaseTermVectorsFormatTestCase.RandomTokenStream rTokenStream =
+        new BaseTermVectorsFormatTestCase.RandomTokenStream(TestUtil.nextInt(random(), 1, 10), terms, termBytes, false);
+    //check to see if the token stream might have a non-deterministic testable result
+    final boolean storeTermVectorPositions = random().nextBoolean();
+    final int[] startOffsets = rTokenStream.getStartOffsets();
+    final int[] positionsIncrements = rTokenStream.getPositionsIncrements();
+    for (int i = 1; i < positionsIncrements.length; i++) {
+      if (storeTermVectorPositions && positionsIncrements[i] != 0) {
+        continue;
+      }
+      //TODO should RandomTokenStream ensure endOffsets for tokens at the same position and same startOffset are
+      // greater than the previous token's endOffset? That would increase the testable possibilities.
+      if (startOffsets[i] == startOffsets[i-1]) {
+        if (VERBOSE)
+          System.out.println("Skipping test because can't easily validate random token-stream is correct.");
+        return;
+      }
+    }
+
+    //sanity check itself
+    assertTokenStreamContents(rTokenStream,
+        rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(),
+        rTokenStream.getPositionsIncrements());
+
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
+    myFieldType.setStoreTermVectors(true);
+    myFieldType.setStoreTermVectorOffsets(true);
+    myFieldType.setStoreTermVectorPositions(storeTermVectorPositions);
+    //payloads require positions; it will throw an error otherwise
+    myFieldType.setStoreTermVectorPayloads(storeTermVectorPositions && random().nextBoolean());
+
+    Document doc = new Document();
+    doc.add(new Field("field", rTokenStream, myFieldType));
+    writer.addDocument(doc);
+
+    IndexReader reader = writer.getReader();
+    writer.close();
+    assertEquals(1, reader.numDocs());
+
+    TokenStream vectorTokenStream = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));
+
+    //sometimes check payloads
+    PayloadAttribute payloadAttribute = null;
+    if (myFieldType.storeTermVectorPayloads() && usually()) {
+      payloadAttribute = vectorTokenStream.addAttribute(PayloadAttribute.class);
+    }
+    assertTokenStreamContents(vectorTokenStream,
+        rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(),
+        myFieldType.storeTermVectorPositions() ? rTokenStream.getPositionsIncrements() : null);
+    //test payloads
+    if (payloadAttribute != null) {
+      vectorTokenStream.reset();
+      for (int i = 0; vectorTokenStream.incrementToken(); i++) {
+        assertEquals(rTokenStream.getPayloads()[i], payloadAttribute.getPayload());
+      }
    }
 
     reader.close();
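Because RandomTokenStream is now public and static (see the test-framework diff below), tests outside BaseTermVectorsFormatTestCase subclasses, like the round-trip test above, can instantiate it directly. A minimal hypothetical construction:

    String[] sampleTerms = {"alpha", "beta"};
    BytesRef[] sampleBytes = {new BytesRef("alpha"), new BytesRef("beta")};
    BaseTermVectorsFormatTestCase.RandomTokenStream rts =
        new BaseTermVectorsFormatTestCase.RandomTokenStream(5, sampleTerms, sampleBytes, false);
    int[] increments = rts.getPositionsIncrements(); // parallel arrays describe what it will emit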
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java
index ef08e9df467..16655ea1b81 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java
@@ -95,17 +95,6 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
     return ft;
   }
 
-  protected BytesRef randomPayload() {
-    final int len = random().nextInt(5);
-    if (len == 0) {
-      return null;
-    }
-    final BytesRef payload = new BytesRef(len);
-    random().nextBytes(payload.bytes);
-    payload.length = len;
-    return payload;
-  }
-
   @Override
   protected void addRandomFields(Document doc) {
     for (Options opts : validOptions()) {
@@ -172,7 +161,9 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
   }
 
   // TODO: use CannedTokenStream?
-  protected class RandomTokenStream extends TokenStream {
+  // TODO: pull out and make a top-level utility, separate from term vectors
+  /** Produces a random TokenStream based on the provided terms.
*/ + public static class RandomTokenStream extends TokenStream { final String[] terms; final BytesRef[] termBytes; @@ -191,11 +182,11 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT final PayloadAttribute pAtt; int i = 0; - protected RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes) { + public RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes) { this(len, sampleTerms, sampleTermBytes, rarely()); } - protected RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes, boolean offsetsGoBackwards) { + public RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes, boolean offsetsGoBackwards) { terms = new String[len]; termBytes = new BytesRef[len]; positionsIncrements = new int[len]; @@ -266,6 +257,17 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT pAtt = addAttribute(PayloadAttribute.class); } + protected BytesRef randomPayload() { + final int len = random().nextInt(5); + if (len == 0) { + return null; + } + final BytesRef payload = new BytesRef(len); + random().nextBytes(payload.bytes); + payload.length = len; + return payload; + } + public boolean hasPayloads() { for (BytesRef payload : payloads) { if (payload != null && payload.length > 0) { @@ -275,9 +277,40 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT return false; } + public String[] getTerms() { + return terms; + } + + public BytesRef[] getTermBytes() { + return termBytes; + } + + public int[] getPositionsIncrements() { + return positionsIncrements; + } + + public int[] getStartOffsets() { + return startOffsets; + } + + public int[] getEndOffsets() { + return endOffsets; + } + + public BytesRef[] getPayloads() { + return payloads; + } + + @Override + public void reset() throws IOException { + i = 0; + super.reset(); + } + @Override public final boolean incrementToken() throws IOException { if (i < terms.length) { + clearAttributes(); termAtt.setLength(0).append(terms[i]); piAtt.setPositionIncrement(positionsIncrements[i]); oAtt.setOffset(startOffsets[i], endOffsets[i]);