From 257ea3423f5b03495e08a8aa4263f9520deb662b Mon Sep 17 00:00:00 2001
From: Mike McCandless <mikemccand@apache.org>
Date: Mon, 10 Oct 2016 11:37:23 -0400
Subject: [PATCH] LUCENE-7476: JapaneseNumberFilter should not invoke
 incrementToken on its input after it's exhausted

---
 lucene/CHANGES.txt                            |   3 +
 .../analysis/ja/JapaneseNumberFilter.java     |  11 +
 .../lucene/analysis/ja/TestFactories.java     | 203 ++++++++++++++++++
 3 files changed, 217 insertions(+)
 create mode 100644 lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3fc306ecc72..0647df016f2 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -67,6 +67,9 @@ Bug Fixes
 * LUCENE-7484: FastVectorHighlighter failed to highlight SynonymQuery
   (Jim Ferenczi via Mike McCandless)
 
+* LUCENE-7476: JapaneseNumberFilter should not invoke incrementToken
+  on its input after it's exhausted (Andy Hind via Mike McCandless)
+
 Improvements
 
 * LUCENE-7439: FuzzyQuery now matches all terms within the specified
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
index a1af95e271b..9f4c1d555df 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
@@ -104,6 +104,8 @@ public class JapaneseNumberFilter extends TokenFilter {
   private StringBuilder numeral;
 
   private int fallThroughTokens;
+  
+  private boolean exhausted = false;
 
   static {
     numerals = new char[0x10000];
@@ -149,7 +151,12 @@ public class JapaneseNumberFilter extends TokenFilter {
       return true;
     }
 
+    if (exhausted) {
+      return false;
+    }
+    
     if (!input.incrementToken()) {
+      exhausted = true;
       return false;
     }
 
@@ -184,6 +191,9 @@ public class JapaneseNumberFilter extends TokenFilter {
 
       endOffset = offsetAttr.endOffset();
       moreTokens = input.incrementToken();
+      if (moreTokens == false) {
+        exhausted = true;
+      }
 
       if (posIncrAttr.getPositionIncrement() == 0) {
         // This token is a stacked/synonym token, capture number of tokens "under" this token,
@@ -227,6 +237,7 @@ public class JapaneseNumberFilter extends TokenFilter {
     fallThroughTokens = 0;
     numeral = new StringBuilder();
     state = null;
+    exhausted = false;
   }
 
   /**
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java
new file mode 100644
index 00000000000..bae8ffc8f53
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ja;
+
+
+import java.io.IOException;
+import java.io.Reader;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.Version;
+
+/**
+ * Sanity check some things about all factories,
+ * we do our best to see if we can sanely initialize it with
+ * no parameters and smoke test it, etc.
+ */
+// TODO: this was copied from the analysis/common module ... find a better way to share it!
+
+// TODO: fix this to use CustomAnalyzer instead of its own FactoryAnalyzer
+public class TestFactories extends BaseTokenStreamTestCase {
+  public void test() throws IOException {
+    for (String tokenizer : TokenizerFactory.availableTokenizers()) {
+      doTestTokenizer(tokenizer);
+    }
+    
+    for (String tokenFilter : TokenFilterFactory.availableTokenFilters()) {
+      doTestTokenFilter(tokenFilter);
+    }
+    
+    for (String charFilter : CharFilterFactory.availableCharFilters()) {
+      doTestCharFilter(charFilter);
+    }
+  }
+  
+  private void doTestTokenizer(String tokenizer) throws IOException {
+    Class<? extends TokenizerFactory> factoryClazz = TokenizerFactory.lookupClass(tokenizer);
+    TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
+    if (factory != null) {
+      // we managed to fully create an instance. check a few more things:
+      
+      // if it implements MultiTermAware, sanity check its impl
+      if (factory instanceof MultiTermAwareComponent) {
+        AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
+        assertNotNull(mtc);
+        // it's not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
+        assertFalse(mtc instanceof CharFilterFactory);
+      }
+      
+      // beast it just a little, it shouldnt throw exceptions:
+      // (it should have thrown them in initialize)
+      Analyzer a = new FactoryAnalyzer(factory, null, null);
+      checkRandomData(random(), a, 20, 20, false, false);
+      a.close();
+    }
+  }
+  
+  private void doTestTokenFilter(String tokenfilter) throws IOException {
+    Class<? extends TokenFilterFactory> factoryClazz = TokenFilterFactory.lookupClass(tokenfilter);
+    TokenFilterFactory factory = (TokenFilterFactory) initialize(factoryClazz);
+    if (factory != null) {
+      // we managed to fully create an instance. check a few more things:
+      
+      // if it implements MultiTermAware, sanity check its impl
+      if (factory instanceof MultiTermAwareComponent) {
+        AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
+        assertNotNull(mtc);
+        // it's not ok to return a charfilter or tokenizer here, this makes no sense
+        assertTrue(mtc instanceof TokenFilterFactory);
+      }
+      
+      // beast it just a little, it shouldnt throw exceptions:
+      // (it should have thrown them in initialize)
+      Analyzer a = new FactoryAnalyzer(assertingTokenizer, factory, null);
+      checkRandomData(random(), a, 20, 20, false, false);
+      a.close();
+    }
+  }
+  
+  private void doTestCharFilter(String charfilter) throws IOException {
+    Class<? extends CharFilterFactory> factoryClazz = CharFilterFactory.lookupClass(charfilter);
+    CharFilterFactory factory = (CharFilterFactory) initialize(factoryClazz);
+    if (factory != null) {
+      // we managed to fully create an instance. check a few more things:
+      
+      // if it implements MultiTermAware, sanity check its impl
+      if (factory instanceof MultiTermAwareComponent) {
+        AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
+        assertNotNull(mtc);
+        // it's not ok to return a tokenizer or tokenfilter here, this makes no sense
+        assertTrue(mtc instanceof CharFilterFactory);
+      }
+      
+      // beast it just a little, it shouldnt throw exceptions:
+      // (it should have thrown them in initialize)
+      Analyzer a = new FactoryAnalyzer(assertingTokenizer, null, factory);
+      checkRandomData(random(), a, 20, 20, false, false);
+      a.close();
+    }
+  }
+  
+  /** tries to initialize a factory with no arguments */
+  private AbstractAnalysisFactory initialize(Class<? extends AbstractAnalysisFactory> factoryClazz) throws IOException {
+    Map<String,String> args = new HashMap<>();
+    args.put("luceneMatchVersion", Version.LATEST.toString());
+    Constructor<? extends AbstractAnalysisFactory> ctor;
+    try {
+      ctor = factoryClazz.getConstructor(Map.class);
+    } catch (Exception e) {
+      throw new RuntimeException("factory '" + factoryClazz + "' does not have a proper ctor!");
+    }
+    
+    AbstractAnalysisFactory factory = null;
+    try {
+      factory = ctor.newInstance(args);
+    } catch (InstantiationException | IllegalAccessException e) {
+      throw new RuntimeException(e);
+    } catch (InvocationTargetException e) {
+      if (e.getCause() instanceof IllegalArgumentException) {
+        // it's ok if we dont provide the right parameters to throw this
+        return null;
+      }
+    }
+    
+    if (factory instanceof ResourceLoaderAware) {
+      try {
+        ((ResourceLoaderAware) factory).inform(new StringMockResourceLoader(""));
+      } catch (IOException ignored) {
+        // it's ok if the right files arent available or whatever to throw this
+      } catch (IllegalArgumentException ignored) {
+        // is this ok? I guess so
+      }
+    }
+    return factory;
+  }
+  
+  // some silly classes just so we can use checkRandomData
+  private TokenizerFactory assertingTokenizer = new TokenizerFactory(new HashMap<String,String>()) {
+    @Override
+    public MockTokenizer create(AttributeFactory factory) {
+      return new MockTokenizer(factory);
+    }
+  };
+  
+  private static class FactoryAnalyzer extends Analyzer {
+    final TokenizerFactory tokenizer;
+    final CharFilterFactory charFilter;
+    final TokenFilterFactory tokenfilter;
+    
+    FactoryAnalyzer(TokenizerFactory tokenizer, TokenFilterFactory tokenfilter, CharFilterFactory charFilter) {
+      assert tokenizer != null;
+      this.tokenizer = tokenizer;
+      this.charFilter = charFilter;
+      this.tokenfilter = tokenfilter;
+    }
+
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName) {
+      Tokenizer tf = tokenizer.create(newAttributeFactory());
+      if (tokenfilter != null) {
+        return new TokenStreamComponents(tf, tokenfilter.create(tf));
+      } else {
+        return new TokenStreamComponents(tf);
+      }
+    }
+
+    @Override
+    protected Reader initReader(String fieldName, Reader reader) {
+      if (charFilter != null) {
+        return charFilter.create(reader);
+      } else {
+        return reader;
+      }
+    }
+  }
+}