mirror of https://github.com/apache/lucene.git
LUCENE-5240: additional safety in Tokenizer state machine
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1529482 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 2cd31e54b5
commit 8fba601ed1
@@ -80,6 +80,10 @@ New Features
  on best effort which was not user-friendly.
  (Uwe Schindler, Robert Muir)

* LUCENE-5240: Tokenizers now throw an IllegalStateException if the
  consumer neglects to call close() on the previous stream before consuming
  the next one. (Uwe Schindler, Robert Muir)

* LUCENE-5214: Add new FreeTextSuggester, to predict the next word
  using a simple ngram language model. This is useful for the "long
  tail" suggestions, when a primary suggester fails to find a
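The CHANGES entry above describes the contract this commit starts enforcing: every TokenStream obtained from an Analyzer must be fully consumed and closed before the next one is requested. The following is only a rough sketch (not part of this commit; the analyzer, field name and texts are placeholders) of that consume/close cycle, mirroring the test changes further down:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/** Sketch only: the consume/close cycle the new check enforces. */
final class ConsumerContractSketch {
  static void consumeTwice(Analyzer analyzer) throws IOException {
    for (String text : new String[] {"first text", "second text"}) {
      TokenStream ts = analyzer.tokenStream("body", text); // field name and texts are illustrative
      try {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                        // required before the first incrementToken()
        while (ts.incrementToken()) {
          System.out.println(term.toString());
        }
        ts.end();                          // records end-of-stream state (e.g. final offset)
      } finally {
        ts.close();                        // skip this, and the second tokenStream() call
      }                                    // above now throws IllegalStateException
    }
  }
}

If close() is skipped after the first stream, the next tokenStream() call reuses the same Tokenizer and trips the new state check shown below in Tokenizer.setReader().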
@@ -48,6 +48,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
    assertEquals("the", term.toString());
    assertTrue(cgf.incrementToken());
    assertEquals("the_s", term.toString());
    cgf.close();

    wt.setReader(new StringReader(input));
    cgf.reset();
@@ -67,6 +68,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
    assertEquals("How_the", term.toString());
    assertTrue(nsf.incrementToken());
    assertEquals("the_s", term.toString());
    nsf.close();

    wt.setReader(new StringReader(input));
    nsf.reset();
@@ -240,6 +240,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
    assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
    assertTrue(tf.incrementToken());
    assertEquals("Rind", termAtt.toString());
    tf.end();
    tf.close();
    wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
    tf.reset();
    assertTrue(tf.incrementToken());
@@ -59,6 +59,8 @@ public class TestExtendedMode extends BaseTokenStreamTestCase {
      while (ts.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(termAtt));
      }
      ts.end();
      ts.close();
    }
  }
@@ -217,6 +217,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
      ts.reset();
      while (ts.incrementToken()) {
      }
      ts.end();
      ts.close();
    }
  }
@@ -240,6 +242,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
      while (ts.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(termAtt));
      }
      ts.end();
      ts.close();
    }
  }
@@ -630,6 +634,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
        final TokenStream ts = analyzer.tokenStream("ignored", line);
        ts.reset();
        while(ts.incrementToken());
        ts.end();
        ts.close();
      }
      String[] sentences = line.split("、|。");
      if (VERBOSE) {
@@ -642,6 +648,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
          final TokenStream ts = analyzer.tokenStream("ignored", sentence);
          ts.reset();
          while(ts.incrementToken());
          ts.end();
          ts.close();
        }
      }
      if (VERBOSE) {
@@ -90,6 +90,8 @@ public class ReadTokensTask extends PerfTask {
        termAtt.fillBytesRef();
        tokenCount++;
      }
      stream.end();
      stream.close();
    }
    totalTokenCount += tokenCount;
    return tokenCount;
@@ -85,8 +85,9 @@ public abstract class Tokenizer extends TokenStream {
  public final void setReader(Reader input) throws IOException {
    if (input == null) {
      throw new NullPointerException("input must not be null");
    } else if (this.input != ILLEGAL_STATE_READER) {
      throw new IllegalStateException("TokenStream contract violation: close() call missing");
    }
    this.input = ILLEGAL_STATE_READER;
    this.inputPending = input;
    assert setReaderTestPoint();
  }
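To make the setReader() guard concrete, here is a hedged sketch (not taken from this commit) of the reuse cycle the stricter state machine expects, assuming `tok` is a freshly constructed Tokenizer subclass such as WhitespaceTokenizer:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;

/** Illustrative only: the reuse cycle and the missing-close misuse the guard rejects. */
final class TokenizerReuseSketch {
  static void reuse(Tokenizer tok) throws IOException {
    tok.setReader(new StringReader("first document"));
    tok.reset();
    while (tok.incrementToken()) { /* read attributes */ }
    tok.end();
    tok.close();                                         // required before reuse
    tok.setReader(new StringReader("second document"));  // legal: previous stream was closed

    // Had close() been skipped above, this second setReader() call would now throw
    // IllegalStateException("TokenStream contract violation: close() call missing").
  }
}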
@@ -401,6 +401,20 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
      ts.end();
      ts.close();
    }

    // check for a missing close()
    ts = a.tokenStream("bogus", input);
    ts.reset();
    while (ts.incrementToken()) {}
    ts.end();
    try {
      ts = a.tokenStream("bogus", input);
      fail("didn't get expected exception when close() not called");
    } catch (IllegalStateException expected) {
      // ok
    } finally {
      ts.close();
    }
  }

  // simple utility method for testing stemmers
@@ -266,6 +266,9 @@ public abstract class CollationTestBase extends LuceneTestCase {
      termAtt.fillBytesRef();
      // ensure we make a copy of the actual bytes too
      map.put(term, BytesRef.deepCopyOf(bytes));
      assertFalse(ts.incrementToken());
      ts.end();
      ts.close();
    }

    Thread threads[] = new Thread[numThreads];
@@ -284,6 +287,9 @@ public abstract class CollationTestBase extends LuceneTestCase {
            assertTrue(ts.incrementToken());
            termAtt.fillBytesRef();
            assertEquals(expected, bytes);
            assertFalse(ts.incrementToken());
            ts.end();
            ts.close();
          }
        } catch (IOException e) {
          throw new RuntimeException(e);
@@ -1,177 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.analysis;

import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.DateField;
import static org.apache.solr.schema.TrieField.TrieTypes;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Iterator;

/**
 * Tokenizer for trie fields. It uses NumericTokenStream to create multiple trie encoded string per number.
 * Each string created by this tokenizer for a given number differs from the previous by the given precisionStep.
 * For query time token streams that only contain the highest precision term, use 32/64 as precisionStep.
 * <p/>
 * Refer to {@link org.apache.lucene.search.NumericRangeQuery} for more details.
 *
 *
 * @see org.apache.lucene.search.NumericRangeQuery
 * @see org.apache.solr.schema.TrieField
 * @since solr 1.4
 */
public class TrieTokenizerFactory extends TokenizerFactory {
  protected final int precisionStep;
  protected final TrieTypes type;

  public TrieTokenizerFactory(TrieTypes type, int precisionStep) {
    super(new HashMap<String,String>());
    this.type = type;
    this.precisionStep = precisionStep;
  }

  @Override
  public TrieTokenizer create(AttributeFactory factory, Reader input) {
    return new TrieTokenizer(input, type, TrieTokenizer.getNumericTokenStream(factory, precisionStep));
  }
}

final class TrieTokenizer extends Tokenizer {
  protected static final DateField dateField = new DateField();
  protected final TrieTypes type;
  protected final NumericTokenStream ts;

  // NumericTokenStream does not support CharTermAttribute so keep it local
  private final CharTermAttribute termAtt = new CharTermAttributeImpl();
  protected final OffsetAttribute ofsAtt = addAttribute(OffsetAttribute.class);
  protected int startOfs, endOfs;
  protected boolean hasValue;

  static NumericTokenStream getNumericTokenStream(AttributeFactory factory, int precisionStep) {
    return new NumericTokenStream(factory, precisionStep);
  }

  public TrieTokenizer(Reader input, TrieTypes type, final NumericTokenStream ts) {
    // Häckidy-Hick-Hack: must share the attributes with the NumericTokenStream we delegate to, so we create a fake factory:
    super(new AttributeFactory() {
      @Override
      public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
        return (AttributeImpl) ts.addAttribute(attClass);
      }
    }, input);
    // add all attributes:
    for (Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator(); it.hasNext();) {
      addAttribute(it.next());
    }
    this.type = type;
    this.ts = ts;
    // dates tend to be longer, especially when math is involved
    termAtt.resizeBuffer( type == TrieTypes.DATE ? 128 : 32 );
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    try {
      int upto = 0;
      char[] buf = termAtt.buffer();
      while (true) {
        final int length = input.read(buf, upto, buf.length-upto);
        if (length == -1) break;
        upto += length;
        if (upto == buf.length)
          buf = termAtt.resizeBuffer(1+buf.length);
      }
      termAtt.setLength(upto);
      this.startOfs = correctOffset(0);
      this.endOfs = correctOffset(upto);

      if (upto == 0) {
        hasValue = false;
        return;
      }

      final String v = new String(buf, 0, upto);
      try {
        switch (type) {
          case INTEGER:
            ts.setIntValue(Integer.parseInt(v));
            break;
          case FLOAT:
            ts.setFloatValue(Float.parseFloat(v));
            break;
          case LONG:
            ts.setLongValue(Long.parseLong(v));
            break;
          case DOUBLE:
            ts.setDoubleValue(Double.parseDouble(v));
            break;
          case DATE:
            ts.setLongValue(dateField.parseMath(null, v).getTime());
            break;
          default:
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field");
        }
      } catch (NumberFormatException nfe) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
            "Invalid Number: " + v);
      }
      hasValue = true;
      ts.reset();
    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to create TrieIndexTokenizer", e);
    }
  }

  @Override
  public void close() throws IOException {
    super.close();
    if (hasValue) {
      ts.close();
    }
  }

  @Override
  public boolean incrementToken() {
    if (hasValue && ts.incrementToken()) {
      ofsAtt.setOffset(startOfs, endOfs);
      return true;
    }
    return false;
  }

  @Override
  public void end() throws IOException {
    super.end();
    if (hasValue) {
      ts.end();
    }
    ofsAtt.setOffset(endOfs, endOfs);
  }
}
@@ -30,6 +30,7 @@ import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
@@ -138,9 +139,10 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
   * @param analyzer The analyzer to use.
   */
  protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
    TokenStream tokenStream = null;
    try {
      final Set<BytesRef> tokens = new HashSet<BytesRef>();
      final TokenStream tokenStream = analyzer.tokenStream("", query);
      tokenStream = analyzer.tokenStream("", query);
      final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
      final BytesRef bytes = bytesAtt.getBytesRef();
@@ -152,10 +154,11 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
      }

      tokenStream.end();
      tokenStream.close();
      return tokens;
    } catch (IOException ioe) {
      throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
    } finally {
      IOUtils.closeWhileHandlingException(tokenStream);
    }
  }
@@ -181,8 +184,11 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
        trackerAtt.setActPosition(position);
        tokens.add(tokenStream.cloneAttributes());
      }
      tokenStream.end();
    } catch (IOException ioe) {
      throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
    } finally {
      IOUtils.closeWhileHandlingException(tokenStream);
    }

    return tokens;
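The two hunks above declare the stream before the try block, keep end() inside it, and route cleanup through IOUtils.closeWhileHandlingException() in finally, so the stream is closed even when consumption fails. Assembled into a standalone sketch (the method name, field name, and messages here are illustrative, not the handler's actual code):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.IOUtils;

/** Sketch only: the exception-safe consumption shape used by the handler changes above. */
final class SafeConsumeSketch {
  static int countTokens(Analyzer analyzer, String text) {
    TokenStream tokenStream = null;
    try {
      tokenStream = analyzer.tokenStream("field", text);
      tokenStream.reset();
      int count = 0;
      while (tokenStream.incrementToken()) {
        count++;
      }
      tokenStream.end();
      return count;
    } catch (IOException ioe) {
      throw new RuntimeException("error while consuming the TokenStream", ioe);
    } finally {
      // Closes even when reset()/incrementToken() threw, so the next tokenStream()
      // call on this Analyzer does not hit the new IllegalStateException.
      IOUtils.closeWhileHandlingException(tokenStream);
    }
  }
}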
@@ -24,8 +24,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.FieldType.NumericType;
@@ -51,8 +49,6 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.mutable.MutableValueDate;
import org.apache.lucene.util.mutable.MutableValueLong;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.TrieTokenizerFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
@@ -111,12 +107,6 @@ public class TrieField extends PrimitiveFieldType {
          "Invalid type specified in schema.xml for field: " + args.get("name"), e);
      }
    }

    CharFilterFactory[] filterFactories = new CharFilterFactory[0];
    TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
    analyzer = new TokenizerChain(filterFactories, new TrieTokenizerFactory(type, precisionStep), tokenFilterFactories);
    // for query time we only need one token, so we use the biggest possible precisionStep:
    queryAnalyzer = new TokenizerChain(filterFactories, new TrieTokenizerFactory(type, Integer.MAX_VALUE), tokenFilterFactories);
  }

  @Override
@@ -223,7 +213,7 @@ public class TrieField extends PrimitiveFieldType {

  @Override
  public boolean isTokenized() {
    return true;
    return false;
  }

  @Override
@@ -382,6 +372,7 @@ public class TrieField extends PrimitiveFieldType {
  @Override
  public void readableToIndexed(CharSequence val, BytesRef result) {
    String s = val.toString();
    try {
      switch (type) {
        case INTEGER:
          NumericUtils.intToPrefixCodedBytes(Integer.parseInt(s), 0, result);
@@ -401,6 +392,10 @@ public class TrieField extends PrimitiveFieldType {
        default:
          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type);
      }
    } catch (NumberFormatException nfe) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
          "Invalid Number: " + val);
    }
  }

  @Override
@@ -16,8 +16,6 @@
 */
package org.apache.solr;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.DateField;
import org.apache.solr.schema.FieldType;
@@ -50,38 +48,6 @@ public class TestTrie extends SolrTestCaseJ4 {
    super.tearDown();
  }

  @Test
  public void testTokenizer() throws Exception {
    FieldType type = h.getCore().getLatestSchema().getFieldType("tint");
    assertTrue(type instanceof TrieField);

    String value = String.valueOf(random().nextInt());
    TokenStream ts = type.getAnalyzer().tokenStream("dummy", value);
    OffsetAttribute ofsAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    int count = 0;
    while (ts.incrementToken()) {
      count++;
      assertEquals(0, ofsAtt.startOffset());
      assertEquals(value.length(), ofsAtt.endOffset());
    }
    final int precStep = ((TrieField) type).getPrecisionStep();
    assertEquals( (32 + precStep - 1) / precStep, count);
    ts.end();
    assertEquals(value.length(), ofsAtt.startOffset());
    assertEquals(value.length(), ofsAtt.endOffset());
    ts.close();

    // Test empty one:
    ts = type.getAnalyzer().tokenStream("dummy", "");
    ts.reset();
    assertFalse(ts.incrementToken());
    ts.end();
    assertEquals(0, ofsAtt.startOffset());
    assertEquals(0, ofsAtt.endOffset());
    ts.close();
  }

  @Test
  public void testTrieIntRangeSearch() throws Exception {
    for (int i = 0; i < 10; i++) {
@@ -39,7 +39,7 @@ public class TestFieldTypeResource extends SolrRestletTestBase {
         "/response/lst[@name='fieldType']/bool[@name='omitPositions'] = 'false'",
         "/response/lst[@name='fieldType']/bool[@name='storeOffsetsWithPositions'] = 'false'",
         "/response/lst[@name='fieldType']/bool[@name='multiValued'] = 'false'",
         "/response/lst[@name='fieldType']/bool[@name='tokenized'] = 'true'",
         "/response/lst[@name='fieldType']/bool[@name='tokenized'] = 'false'",
         "/response/lst[@name='fieldType']/arr[@name='fields']/str = 'weight'",
         "/response/lst[@name='fieldType']/arr[@name='dynamicFields']/str = '*_f'");
  }
@@ -69,7 +69,7 @@ public class TestFieldTypeResource extends SolrRestletTestBase {
         "/fieldType/omitPositions==false",
         "/fieldType/storeOffsetsWithPositions==false",
         "/fieldType/multiValued==false",
         "/fieldType/tokenized==true",
         "/fieldType/tokenized==false",
         "/fieldType/fields==['weight']",
         "/fieldType/dynamicFields==['*_f']");
  }