LUCENE-6121: CachingTokenFilter.reset() propagates to input if not cached

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1646737 13f79535-47bb-0310-9956-ffa450edef68
David Wayne Smiley 2014-12-19 14:38:02 +00:00
parent 5a9639c3e6
commit e4180d30bb
9 changed files with 79 additions and 51 deletions

View File

@ -302,6 +302,10 @@ API Changes
* LUCENE-6099: Add FilterDirectory.unwrap and
FilterDirectoryReader.unwrap (Simon Willnauer, Mike McCandless)
* LUCENE-6121: CachingTokenFilter.reset() now propagates to its input if called before
incrementToken(). You must now call reset() on this filter instead of calling it up-front on the
input, which previously didn't work. (David Smiley, Robert Muir)
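
Illustrating the new contract: a minimal sketch of consuming a stream twice, assuming an
arbitrary Analyzer (the field name and text here are placeholders, not from this patch):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.CachingTokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    try (TokenStream source = analyzer.tokenStream("body", "some text")) {
      CachingTokenFilter buffer = new CachingTokenFilter(source);
      buffer.reset();                  // first reset: propagates to source
      while (buffer.incrementToken()) {
        // first pass: tokens stream from source and are cached
      }
      buffer.reset();                  // later resets: rewind the cache only
      while (buffer.incrementToken()) {
        // second pass: tokens replay from the cache
      }
      buffer.end();
    }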
Bug Fixes
* LUCENE-5650: Enforce read-only access to any path outside the temporary

View File

@ -143,9 +143,8 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer1.toString()));
final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
tee1.reset();
final TokenStream source1 = new CachingTokenFilter(tee1);
tee1.addAttribute(CheckClearAttributesAttribute.class);
dogDetector.addAttribute(CheckClearAttributesAttribute.class);
theDetector.addAttribute(CheckClearAttributesAttribute.class);
@ -163,7 +162,6 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(theDetector, new String[]{"The", "the", "The", "the"});
assertTokenStreamContents(dogDetector, new String[]{"Dogs", "Dogs"});
source1.reset();
TokenStream lowerCasing = new LowerCaseFilter(source1);
String[] lowerCaseTokens = new String[tokens1.length];
for (int i = 0; i < tokens1.length; i++)

View File

@ -28,11 +28,11 @@ import org.apache.lucene.util.AttributeSource;
* This class can be used if the token attributes of a TokenStream
* are intended to be consumed more than once. It caches
* all token attribute states locally in a List when the first call to
* {@link #incrementToken()} is called.
*
* <P>CachingTokenFilter implements the optional method
* {@link TokenStream#reset()}, which repositions the
* stream to the first Token.
* {@link #incrementToken()} is called. Subsequent calls will use the cache.
* <p>
* <em>Important:</em> Like any proper TokenFilter, {@link #reset()} propagates
* to the input, although only before {@link #incrementToken()} is called the
* first time. Prior to Lucene 5, it was never propagated.
*/
public final class CachingTokenFilter extends TokenFilter {
private List<AttributeSource.State> cache = null;
@ -40,17 +40,31 @@ public final class CachingTokenFilter extends TokenFilter {
private AttributeSource.State finalState;
/**
* Create a new CachingTokenFilter around <code>input</code>,
* caching its token attributes, which can be replayed again
* after a call to {@link #reset()}.
* Create a new CachingTokenFilter around <code>input</code>. As with
* any normal TokenFilter, do <em>not</em> call reset on the input; this filter
* will do so itself on its first reset.
*/
public CachingTokenFilter(TokenStream input) {
super(input);
}
/**
* Propagates reset if incrementToken has not yet been called. Otherwise
* it rewinds the iterator to the beginning of the cached list.
*/
@Override
public void reset() throws IOException {
if (cache == null) {//first time
input.reset();
} else {
iterator = cache.iterator();
}
}
/** The first time this is called, it'll read and cache all tokens from the input. */
@Override
public final boolean incrementToken() throws IOException {
if (cache == null) {
if (cache == null) {//first time
// fill cache lazily
cache = new ArrayList<>(64);
fillCache();
@ -65,7 +79,7 @@ public final class CachingTokenFilter extends TokenFilter {
restoreState(iterator.next());
return true;
}
@Override
public final void end() {
if (finalState != null) {
@ -73,20 +87,6 @@ public final class CachingTokenFilter extends TokenFilter {
}
}
/**
* Rewinds the iterator to the beginning of the cached list.
* <p>
* Note that this does not call reset() on the wrapped tokenstream ever, even
* the first time. You should reset() the inner tokenstream before wrapping
* it with CachingTokenFilter.
*/
@Override
public void reset() {
if (cache != null) {
iterator = cache.iterator();
}
}
private void fillCache() throws IOException {
while (input.incrementToken()) {
cache.add(captureState());
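
To summarize the control flow above: the filter now has two phases, keyed off whether the
cache exists yet. A hedged walk-through (ts stands for any input TokenStream):

    CachingTokenFilter buffer = new CachingTokenFilter(ts);
    buffer.reset();            // cache == null: forwards to ts.reset()
    buffer.incrementToken();   // first call: fillCache() drains ts, then replays token 1
    // ... further calls replay tokens 2..n from the cache ...
    buffer.reset();            // cache != null: rewinds the iterator; ts is untouched

Note that a second reset() before the first incrementToken() would forward to the input
again, which most tokenizers reject; the new testDoubleResetFails below exercises that.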

View File

@ -203,7 +203,6 @@ public class QueryBuilder {
boolean hasMoreTokens = false;
try (TokenStream source = analyzer.tokenStream(field, queryText)) {
source.reset();
buffer = new CachingTokenFilter(source);
buffer.reset();
@ -226,13 +225,13 @@ public class QueryBuilder {
} catch (IOException e) {
// ignore
}
// rewind the buffer stream
buffer.reset();//will never throw on subsequent reset calls
}
} catch (IOException e) {
throw new RuntimeException("Error analyzing query text", e);
}
// rewind the buffer stream
buffer.reset();
BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();
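
Extracted from the hunks above, QueryBuilder's consumption pattern now looks roughly like
this (a sketch; the attribute handling and query construction are elided):

    try (TokenStream source = analyzer.tokenStream(field, queryText)) {
      CachingTokenFilter buffer = new CachingTokenFilter(source);
      buffer.reset();            // replaces the old source.reset(); may throw IOException
      while (buffer.incrementToken()) {
        // count tokens and position increments; everything is cached as it streams by
      }
      buffer.reset();            // rewinds the cache; performs no I/O, so it cannot throw
      // replay the cached tokens to build the query ...
    }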

View File

@ -19,14 +19,15 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
@ -39,11 +40,18 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
AtomicInteger resetCount = new AtomicInteger(0);
TokenStream stream = new TokenStream() {
private int index = 0;
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@Override
public void reset() throws IOException {
super.reset();
resetCount.incrementAndGet();
}
@Override
public boolean incrementToken() {
if (index == tokens.length) {
@ -57,16 +65,20 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
}
};
stream = new CachingTokenFilter(stream);
doc.add(new TextField("preanalyzed", stream));
// 1) we consume all tokens twice before we add the doc to the index
assertFalse(((CachingTokenFilter)stream).isCached());
stream.reset();
assertFalse(((CachingTokenFilter) stream).isCached());
checkTokens(stream);
stream.reset();
checkTokens(stream);
assertTrue(((CachingTokenFilter)stream).isCached());
// 2) now add the document to the index and verify if all tokens are indexed
// don't reset the stream here; the DocumentWriter should do that implicitly
writer.addDocument(doc);
@ -101,8 +113,26 @@ public class TestCachingTokenFilter extends BaseTokenStreamTestCase {
// 3) reset stream and consume tokens again
stream.reset();
checkTokens(stream);
assertEquals(1, resetCount.get());
dir.close();
}
public void testDoubleResetFails() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
final TokenStream input = analyzer.tokenStream("field", "abc");
CachingTokenFilter buffer = new CachingTokenFilter(input);
buffer.reset();//ok
boolean madeIt = false;
try {
buffer.reset();//bad (this used to work, which we don't want)
madeIt = true;
} catch (Throwable e) {
//ignore
}
assertFalse(madeIt);
}
private void checkTokens(TokenStream stream) throws IOException {
int count = 0;

View File

@ -175,14 +175,12 @@ public class TestTermVectorsWriter extends LuceneTestCase {
Analyzer analyzer = new MockAnalyzer(random());
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
Document doc = new Document();
try (TokenStream stream = analyzer.tokenStream("field", "abcd ")) {
stream.reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
TokenStream cachedStream = new CachingTokenFilter(stream);
try (TokenStream stream = new CachingTokenFilter(analyzer.tokenStream("field", "abcd "))) {
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
customType.setStoreTermVectors(true);
customType.setStoreTermVectorPositions(true);
customType.setStoreTermVectorOffsets(true);
Field f = new Field("field", cachedStream, customType);
Field f = new Field("field", stream, customType);
doc.add(f);
doc.add(f);
w.addDocument(doc);

View File

@ -187,7 +187,6 @@ public class Highlighter
CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
tokenStream.reset();
TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
if (fragmentScorer instanceof QueryScorer) {
@ -214,6 +213,7 @@ public class Highlighter
TokenGroup tokenGroup=new TokenGroup(tokenStream);
tokenStream.reset();
for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
next = tokenStream.incrementToken())
{

View File

@ -394,7 +394,6 @@ public class WeightedSpanTermExtractor {
indexer.addField(DelegatingLeafReader.FIELD_NAME,
new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
}
tokenStream.reset();//reset to beginning when we return
final IndexSearcher searcher = indexer.createSearcher();
// MEM index has only atomic ctx
internalReader = ((LeafReaderContext) searcher.getTopReaderContext()).reader();

View File

@ -130,9 +130,9 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {
try {
try (TokenStream source = this.analyzer.tokenStream(field, text)) {
source.reset();
buffer = new CachingTokenFilter(source);
buffer.reset();
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
}
@ -155,13 +155,13 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {
} catch (IOException e) {
// ignore
}
// rewind the buffer stream
buffer.reset();//will never throw on subsequent reset calls
} catch (IOException e) {
throw new RuntimeException(e);
}
// rewind the buffer stream
buffer.reset();
if (!buffer.hasAttribute(CharTermAttribute.class)) {
return new NoTokenFoundQueryNode();
}