mirror of https://github.com/apache/lucene.git
LUCENE-6121: CachingTokenFilter.reset() propagates to input if not cached
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1646737 13f79535-47bb-0310-9956-ffa450edef68
parent 5a9639c3e6
commit e4180d30bb
@@ -302,6 +302,10 @@ API Changes
 * LUCENE-6099: Add FilterDirectory.unwrap and
   FilterDirectoryReader.unwrap (Simon Willnauer, Mike McCandless)
 
+* LUCENE-6121: CachingTokenFilter.reset() now propagates to its input if called before
+  incrementToken(). You must call reset() now on this filter instead of doing it a-priori on the
+  input(), which previously didn't work. (David Smiley, Robert Muir)
+
 Bug Fixes
 
 * LUCENE-5650: Enforce read-only access to any path outside the temporary
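In practical terms, the entry above flips the required call order: wrap first, then reset the wrapper. A minimal before/after sketch (the analyzer, field name, and text below are illustrative placeholders, not code from this commit):

    // Old pattern (no longer works): reset the input up front, then wrap;
    // resetting the wrapper was a no-op.
    TokenStream source = analyzer.tokenStream("field", "some text");
    source.reset();
    TokenStream cached = new CachingTokenFilter(source);

    // New pattern: wrap first, then reset the CachingTokenFilter itself;
    // the first reset() is forwarded to the wrapped input.
    TokenStream source = analyzer.tokenStream("field", "some text");
    TokenStream cached = new CachingTokenFilter(source);
    cached.reset();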
@@ -143,9 +143,8 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
     final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer1.toString()));
     final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
     final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
-    tee1.reset();
     final TokenStream source1 = new CachingTokenFilter(tee1);
 
     tee1.addAttribute(CheckClearAttributesAttribute.class);
     dogDetector.addAttribute(CheckClearAttributesAttribute.class);
     theDetector.addAttribute(CheckClearAttributesAttribute.class);

@@ -163,7 +162,6 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
     assertTokenStreamContents(theDetector, new String[]{"The", "the", "The", "the"});
     assertTokenStreamContents(dogDetector, new String[]{"Dogs", "Dogs"});
 
-    source1.reset();
     TokenStream lowerCasing = new LowerCaseFilter(source1);
     String[] lowerCaseTokens = new String[tokens1.length];
     for (int i = 0; i < tokens1.length; i++)
@@ -28,11 +28,11 @@ import org.apache.lucene.util.AttributeSource;
  * This class can be used if the token attributes of a TokenStream
  * are intended to be consumed more than once. It caches
  * all token attribute states locally in a List when the first call to
- * {@link #incrementToken()} is called.
- *
- * <P>CachingTokenFilter implements the optional method
- * {@link TokenStream#reset()}, which repositions the
- * stream to the first Token.
+ * {@link #incrementToken()} is called. Subsequent calls will use the cache.
+ * <p/>
+ * <em>Important:</em> Like any proper TokenFilter, {@link #reset()} propagates
+ * to the input, although only before {@link #incrementToken()} is called the
+ * first time. Prior to Lucene 5, it was never propagated.
  */
 public final class CachingTokenFilter extends TokenFilter {
   private List<AttributeSource.State> cache = null;
@@ -40,17 +40,31 @@ public final class CachingTokenFilter extends TokenFilter {
   private AttributeSource.State finalState;
 
   /**
-   * Create a new CachingTokenFilter around <code>input</code>,
-   * caching its token attributes, which can be replayed again
-   * after a call to {@link #reset()}.
+   * Create a new CachingTokenFilter around <code>input</code>. As with
+   * any normal TokenFilter, do <em>not</em> call reset on the input; this filter
+   * will do it normally.
    */
   public CachingTokenFilter(TokenStream input) {
     super(input);
   }
 
+  /**
+   * Propagates reset if incrementToken has not yet been called. Otherwise
+   * it rewinds the iterator to the beginning of the cached list.
+   */
+  @Override
+  public void reset() throws IOException {
+    if (cache == null) {//first time
+      input.reset();
+    } else {
+      iterator = cache.iterator();
+    }
+  }
+
+  /** The first time called, it'll read and cache all tokens from the input. */
   @Override
   public final boolean incrementToken() throws IOException {
-    if (cache == null) {
+    if (cache == null) {//first-time
       // fill cache lazily
       cache = new ArrayList<>(64);
       fillCache();
@@ -65,7 +79,7 @@ public final class CachingTokenFilter extends TokenFilter {
     restoreState(iterator.next());
     return true;
   }
-
+
   @Override
   public final void end() {
     if (finalState != null) {
@@ -73,20 +87,6 @@ public final class CachingTokenFilter extends TokenFilter {
     }
   }
 
-  /**
-   * Rewinds the iterator to the beginning of the cached list.
-   * <p>
-   * Note that this does not call reset() on the wrapped tokenstream ever, even
-   * the first time. You should reset() the inner tokenstream before wrapping
-   * it with CachingTokenFilter.
-   */
-  @Override
-  public void reset() {
-    if (cache != null) {
-      iterator = cache.iterator();
-    }
-  }
-
   private void fillCache() throws IOException {
     while (input.incrementToken()) {
       cache.add(captureState());
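Taken together, the new reset() and incrementToken() give the following consumption pattern. This is a sketch under the new contract, not code from the patch; the analyzer, field name, and text are stand-ins:

    import java.io.IOException;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.CachingTokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    class ReplayExample {
      static void consumeTwice(Analyzer analyzer) throws IOException {
        try (TokenStream stream = new CachingTokenFilter(analyzer.tokenStream("body", "the quick fox"))) {
          CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

          stream.reset();                    // cache == null: forwarded to the wrapped input
          while (stream.incrementToken()) {  // first pass reads the input and fills the cache
            System.out.println(termAtt);
          }
          stream.end();

          stream.reset();                    // cache != null: only rewinds the cache iterator
          while (stream.incrementToken()) {  // second pass replays the captured states
            System.out.println(termAtt);
          }
        }
      }
    }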
@@ -203,7 +203,6 @@ public class QueryBuilder {
     boolean hasMoreTokens = false;
 
     try (TokenStream source = analyzer.tokenStream(field, queryText)) {
-      source.reset();
       buffer = new CachingTokenFilter(source);
+      buffer.reset();
-
 

@@ -226,13 +225,13 @@ public class QueryBuilder {
       } catch (IOException e) {
         // ignore
       }
+
+      // rewind the buffer stream
+      buffer.reset();//will never throw on subsequent reset calls
     }
   } catch (IOException e) {
     throw new RuntimeException("Error analyzing query text", e);
   }
 
-  // rewind the buffer stream
-  buffer.reset();
-
   BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();
 
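QueryBuilder's two passes over the cached stream look roughly like this in isolation (a simplified sketch, assuming hypothetical analyzer, field, and queryText values; the real method also inspects the term-bytes attribute and builds the query from the second pass):

    try (TokenStream source = analyzer.tokenStream(field, queryText)) {
      CachingTokenFilter buffer = new CachingTokenFilter(source);
      buffer.reset();  // single up-front reset; propagates to source

      PositionIncrementAttribute posIncAtt =
          buffer.addAttribute(PositionIncrementAttribute.class);
      int numTokens = 0;
      int positionCount = 0;
      while (buffer.incrementToken()) {  // first pass counts tokens and fills the cache
        numTokens++;
        positionCount += posIncAtt.getPositionIncrement();
      }

      buffer.reset();  // safe now: it only rewinds the cache
      // ... second pass consumes the replayed tokens to build the query ...
    }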
@@ -19,14 +19,15 @@ package org.apache.lucene.analysis;
 
 
 import java.io.IOException;
+import java.util.concurrent.atomic.AtomicInteger;
 
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.Directory;
@@ -39,11 +40,18 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
     Directory dir = newDirectory();
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
     Document doc = new Document();
+    AtomicInteger resetCount = new AtomicInteger(0);
     TokenStream stream = new TokenStream() {
       private int index = 0;
       private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
       private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 
+      @Override
+      public void reset() throws IOException {
+        super.reset();
+        resetCount.incrementAndGet();
+      }
+
       @Override
       public boolean incrementToken() {
         if (index == tokens.length) {
@@ -57,16 +65,20 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
       }
 
     };
 
     stream = new CachingTokenFilter(stream);
 
     doc.add(new TextField("preanalyzed", stream));
 
     // 1) we consume all tokens twice before we add the doc to the index
+    assertFalse(((CachingTokenFilter)stream).isCached());
+    stream.reset();
+    assertFalse(((CachingTokenFilter) stream).isCached());
     checkTokens(stream);
     stream.reset();
     checkTokens(stream);
 
+    assertTrue(((CachingTokenFilter)stream).isCached());
+
     // 2) now add the document to the index and verify if all tokens are indexed
     // don't reset the stream here, the DocumentWriter should do that implicitly
     writer.addDocument(doc);
@@ -101,8 +113,26 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
     // 3) reset stream and consume tokens again
     stream.reset();
     checkTokens(stream);
 
+    assertEquals(1, resetCount.get());
+
     dir.close();
   }
 
+  public void testDoubleResetFails() throws IOException {
+    Analyzer analyzer = new MockAnalyzer(random());
+    final TokenStream input = analyzer.tokenStream("field", "abc");
+    CachingTokenFilter buffer = new CachingTokenFilter(input);
+    buffer.reset();//ok
+    boolean madeIt = false;
+    try {
+      buffer.reset();//bad (this used to work which we don't want)
+      madeIt = true;
+    } catch (Throwable e) {
+      //ignore
+    }
+    assertFalse(madeIt);
+  }
+
   private void checkTokens(TokenStream stream) throws IOException {
     int count = 0;
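The failure in testDoubleResetFails comes from the propagation itself: before anything is cached, each reset() is forwarded to the wrapped stream, and MockAnalyzer's tokenizer enforces the TokenStream workflow, rejecting a second reset() without an intervening close() (via an assertion, which is why the test catches Throwable rather than Exception). A sketch of the failing sequence, with illustrative comments:

    TokenStream input = analyzer.tokenStream("field", "abc");
    CachingTokenFilter buffer = new CachingTokenFilter(input);

    buffer.reset();  // cache == null: forwarded to input.reset(); legal first call
    buffer.reset();  // cache is still null (no incrementToken() yet), so it is
                     // forwarded again; the input's state check trips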
@@ -175,14 +175,12 @@ public class TestTermVectorsWriter extends LuceneTestCase {
     Analyzer analyzer = new MockAnalyzer(random());
     IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
     Document doc = new Document();
-    try (TokenStream stream = analyzer.tokenStream("field", "abcd ")) {
-      stream.reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
-      TokenStream cachedStream = new CachingTokenFilter(stream);
+    try (TokenStream stream = new CachingTokenFilter(analyzer.tokenStream("field", "abcd "))) {
       FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
       customType.setStoreTermVectors(true);
       customType.setStoreTermVectorPositions(true);
       customType.setStoreTermVectorOffsets(true);
-      Field f = new Field("field", cachedStream, customType);
+      Field f = new Field("field", stream, customType);
       doc.add(f);
       doc.add(f);
       w.addDocument(doc);
@@ -187,7 +187,6 @@ public class Highlighter
 
       CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
       OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
-      tokenStream.reset();
       TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
 
       if (fragmentScorer instanceof QueryScorer) {

@@ -214,6 +213,7 @@ public class Highlighter
 
       TokenGroup tokenGroup=new TokenGroup(tokenStream);
 
+      tokenStream.reset();
       for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
           next = tokenStream.incrementToken())
       {
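Read together, the two Highlighter hunks move reset() from attribute setup down to just before the consumption loop, so the stream is reset exactly once, immediately before iteration. Schematically (a simplified sketch of the surrounding method, not the full fragment-scoring logic):

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    // ... build fragments, scorers, and the TokenGroup; no tokens consumed yet ...

    tokenStream.reset();  // once, right before the first incrementToken()
    while (tokenStream.incrementToken() && offsetAtt.startOffset() < maxDocCharsToAnalyze) {
      // ... score the token and append it to the current fragment ...
    }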
@@ -394,7 +394,6 @@ public class WeightedSpanTermExtractor {
         indexer.addField(DelegatingLeafReader.FIELD_NAME,
             new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
       }
-      tokenStream.reset();//reset to beginning when we return
       final IndexSearcher searcher = indexer.createSearcher();
       // MEM index has only atomic ctx
       internalReader = ((LeafReaderContext) searcher.getTopReaderContext()).reader();
@@ -130,9 +130,9 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {
 
     try {
       try (TokenStream source = this.analyzer.tokenStream(field, text)) {
-        source.reset();
         buffer = new CachingTokenFilter(source);
+        buffer.reset();
 
         if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
           posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
         }

@@ -155,13 +155,13 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {
       } catch (IOException e) {
         // ignore
       }
+
+      // rewind the buffer stream
+      buffer.reset();//will never throw on subsequent reset calls
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
 
-    // rewind the buffer stream
-    buffer.reset();
-
     if (!buffer.hasAttribute(CharTermAttribute.class)) {
       return new NoTokenFoundQueryNode();
     }