LUCENE-6121: CachingTokenFilter.reset() propagates to input if not cached

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1646737 13f79535-47bb-0310-9956-ffa450edef68
David Wayne Smiley 2014-12-19 14:38:02 +00:00
parent 5a9639c3e6
commit e4180d30bb
9 changed files with 79 additions and 51 deletions

View File

@ -302,6 +302,10 @@ API Changes
* LUCENE-6099: Add FilterDirectory.unwrap and
FilterDirectoryReader.unwrap (Simon Willnauer, Mike McCandless)
* LUCENE-6121: CachingTokenFilter.reset() now propagates to its input if called before
incrementToken(). You must now call reset() on this filter instead of calling it up-front on the
input, which previously didn't work. (David Smiley, Robert Muir)
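
Illustrating the new contract: a minimal sketch of consuming a stream twice, assuming an
arbitrary Analyzer (the field name and text here are placeholders, not from this patch):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.CachingTokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    try (TokenStream source = analyzer.tokenStream("body", "some text")) {
      CachingTokenFilter buffer = new CachingTokenFilter(source);
      buffer.reset();                  // first reset: propagates to source
      while (buffer.incrementToken()) {
        // first pass: tokens stream from source and are cached
      }
      buffer.reset();                  // later resets: rewind the cache only
      while (buffer.incrementToken()) {
        // second pass: tokens replay from the cache
      }
      buffer.end();
    }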
Bug Fixes
* LUCENE-5650: Enforce read-only access to any path outside the temporary

View File

@ -143,9 +143,8 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer1.toString()));
final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
tee1.reset();
final TokenStream source1 = new CachingTokenFilter(tee1);
tee1.addAttribute(CheckClearAttributesAttribute.class);
dogDetector.addAttribute(CheckClearAttributesAttribute.class);
theDetector.addAttribute(CheckClearAttributesAttribute.class);
@ -163,7 +162,6 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(theDetector, new String[]{"The", "the", "The", "the"});
assertTokenStreamContents(dogDetector, new String[]{"Dogs", "Dogs"});
source1.reset();
TokenStream lowerCasing = new LowerCaseFilter(source1);
String[] lowerCaseTokens = new String[tokens1.length];
for (int i = 0; i < tokens1.length; i++)

View File

@ -28,11 +28,11 @@ import org.apache.lucene.util.AttributeSource;
* This class can be used if the token attributes of a TokenStream
* are intended to be consumed more than once. It caches
* all token attribute states locally in a List when the first call to
* {@link #incrementToken()} is called.
*
* <P>CachingTokenFilter implements the optional method
* {@link TokenStream#reset()}, which repositions the
* stream to the first Token.
* {@link #incrementToken()} is called. Subsequent calls will use the cache.
* <p>
* <em>Important:</em> Like any proper TokenFilter, {@link #reset()} propagates
* to the input, although only before {@link #incrementToken()} is called the
* first time. Prior to Lucene 5, it was never propagated.
*/
public final class CachingTokenFilter extends TokenFilter {
private List<AttributeSource.State> cache = null;
@ -40,17 +40,31 @@ public final class CachingTokenFilter extends TokenFilter {
private AttributeSource.State finalState;
/**
* Create a new CachingTokenFilter around <code>input</code>,
* caching its token attributes, which can be replayed again
* after a call to {@link #reset()}.
* Create a new CachingTokenFilter around <code>input</code>. As with
* any normal TokenFilter, do <em>not</em> call reset on the input; this filter
* will do so itself on its first reset.
*/
public CachingTokenFilter(TokenStream input) {
super(input);
}
/**
* Propagates reset if incrementToken has not yet been called. Otherwise
* it rewinds the iterator to the beginning of the cached list.
*/
@Override
public void reset() throws IOException {
if (cache == null) {//first time
input.reset();
} else {
iterator = cache.iterator();
}
}
/** The first time this is called, it'll read and cache all tokens from the input. */
@Override
public final boolean incrementToken() throws IOException {
if (cache == null) {
if (cache == null) {//first time
// fill cache lazily
cache = new ArrayList<>(64);
fillCache();
@ -65,7 +79,7 @@ public final class CachingTokenFilter extends TokenFilter {
restoreState(iterator.next());
return true;
}
@Override
public final void end() {
if (finalState != null) {
@ -73,20 +87,6 @@ public final class CachingTokenFilter extends TokenFilter {
}
}
/**
* Rewinds the iterator to the beginning of the cached list.
* <p>
* Note that this does not call reset() on the wrapped tokenstream ever, even
* the first time. You should reset() the inner tokenstream before wrapping
* it with CachingTokenFilter.
*/
@Override
public void reset() {
if (cache != null) {
iterator = cache.iterator();
}
}
private void fillCache() throws IOException {
while (input.incrementToken()) {
cache.add(captureState());
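
To summarize the control flow above: the filter now has two phases, keyed off whether the
cache exists yet. A hedged walk-through (ts stands for any input TokenStream):

    CachingTokenFilter buffer = new CachingTokenFilter(ts);
    buffer.reset();            // cache == null: forwards to ts.reset()
    buffer.incrementToken();   // first call: fillCache() drains ts, then replays token 1
    // ... further calls replay tokens 2..n from the cache ...
    buffer.reset();            // cache != null: rewinds the iterator; ts is untouched

Note that a second reset() before the first incrementToken() would forward to the input
again, which most tokenizers reject; the new testDoubleResetFails below exercises that.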

View File

@ -203,7 +203,6 @@ public class QueryBuilder {
boolean hasMoreTokens = false;
try (TokenStream source = analyzer.tokenStream(field, queryText)) {
source.reset();
buffer = new CachingTokenFilter(source);
buffer.reset();
@ -226,13 +225,13 @@ public class QueryBuilder {
} catch (IOException e) {
// ignore
}
// rewind the buffer stream
buffer.reset();//will never throw on subsequent reset calls
}
} catch (IOException e) {
throw new RuntimeException("Error analyzing query text", e);
}
// rewind the buffer stream
buffer.reset();
BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();
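
Extracted from the hunks above, QueryBuilder's consumption pattern now looks roughly like
this (a sketch; the attribute handling and query construction are elided):

    try (TokenStream source = analyzer.tokenStream(field, queryText)) {
      CachingTokenFilter buffer = new CachingTokenFilter(source);
      buffer.reset();            // replaces the old source.reset(); may throw IOException
      while (buffer.incrementToken()) {
        // count tokens and position increments; everything is cached as it streams by
      }
      buffer.reset();            // rewinds the cache; performs no I/O, so it cannot throw
      // replay the cached tokens to build the query ...
    }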

View File

@ -19,14 +19,15 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
@ -39,11 +40,18 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
AtomicInteger resetCount = new AtomicInteger(0);
TokenStream stream = new TokenStream() {
private int index = 0;
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@Override
public void reset() throws IOException {
super.reset();
resetCount.incrementAndGet();
}
@Override
public boolean incrementToken() {
if (index == tokens.length) {
@ -57,16 +65,20 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
}
};
stream = new CachingTokenFilter(stream);
doc.add(new TextField("preanalyzed", stream));
// 1) we consume all tokens twice before we add the doc to the index
assertFalse(((CachingTokenFilter)stream).isCached());
stream.reset();
assertFalse(((CachingTokenFilter) stream).isCached());
checkTokens(stream);
stream.reset();
checkTokens(stream);
assertTrue(((CachingTokenFilter)stream).isCached());
// 2) now add the document to the index and verify if all tokens are indexed
// don't reset the stream here; the DocumentWriter should do that implicitly
writer.addDocument(doc);
@ -101,8 +113,26 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
// 3) reset stream and consume tokens again
stream.reset();
checkTokens(stream);
assertEquals(1, resetCount.get());
dir.close();
}
public void testDoubleResetFails() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
final TokenStream input = analyzer.tokenStream("field", "abc");
CachingTokenFilter buffer = new CachingTokenFilter(input);
buffer.reset();//ok
boolean madeIt = false;
try {
buffer.reset();//bad (this used to work, which we don't want)
madeIt = true;
} catch (Throwable e) {
//ignore
}
assertFalse(madeIt);
}
private void checkTokens(TokenStream stream) throws IOException {
int count = 0;

View File

@ -175,14 +175,12 @@ public class TestTermVectorsWriter extends LuceneTestCase {
Analyzer analyzer = new MockAnalyzer(random());
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
Document doc = new Document();
try (TokenStream stream = analyzer.tokenStream("field", "abcd ")) {
stream.reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
TokenStream cachedStream = new CachingTokenFilter(stream);
try (TokenStream stream = new CachingTokenFilter(analyzer.tokenStream("field", "abcd "))) {
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
customType.setStoreTermVectors(true);
customType.setStoreTermVectorPositions(true);
customType.setStoreTermVectorOffsets(true);
Field f = new Field("field", cachedStream, customType);
Field f = new Field("field", stream, customType);
doc.add(f);
doc.add(f);
w.addDocument(doc);

View File

@ -187,7 +187,6 @@ public class Highlighter
CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
tokenStream.reset();
TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
if (fragmentScorer instanceof QueryScorer) {
@ -214,6 +213,7 @@ public class Highlighter
TokenGroup tokenGroup=new TokenGroup(tokenStream);
tokenStream.reset();
for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
next = tokenStream.incrementToken())
{

View File

@ -394,7 +394,6 @@ public class WeightedSpanTermExtractor {
indexer.addField(DelegatingLeafReader.FIELD_NAME,
new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
}
tokenStream.reset();//reset to beginning when we return
final IndexSearcher searcher = indexer.createSearcher();
// MEM index has only atomic ctx
internalReader = ((LeafReaderContext) searcher.getTopReaderContext()).reader();

View File

@ -130,9 +130,9 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {
try {
try (TokenStream source = this.analyzer.tokenStream(field, text)) {
source.reset();
buffer = new CachingTokenFilter(source);
buffer.reset();
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
}
@ -155,13 +155,13 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {
} catch (IOException e) {
// ignore
}
// rewind the buffer stream
buffer.reset();//will never throw on subsequent reset calls
} catch (IOException e) {
throw new RuntimeException(e);
}
// rewind the buffer stream
buffer.reset();
if (!buffer.hasAttribute(CharTermAttribute.class)) {
return new NoTokenFoundQueryNode();
}