From 257ea3423f5b03495e08a8aa4263f9520deb662b Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Mon, 10 Oct 2016 11:37:23 -0400 Subject: [PATCH] LUCENE-7476: JapaneseNumberFilter should not invoke incrementToken on its input after it's exhausted --- lucene/CHANGES.txt | 3 + .../analysis/ja/JapaneseNumberFilter.java | 11 + .../lucene/analysis/ja/TestFactories.java | 203 ++++++++++++++++++ 3 files changed, 217 insertions(+) create mode 100644 lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 3fc306ecc72..0647df016f2 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -67,6 +67,9 @@ Bug Fixes * LUCENE-7484: FastVectorHighlighter failed to highlight SynonymQuery (Jim Ferenczi via Mike McCandless) +* LUCENE-7476: JapaneseNumberFilter should not invoke incrementToken + on its input after it's exhausted (Andy Hind via Mike McCandless) + Improvements * LUCENE-7439: FuzzyQuery now matches all terms within the specified diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java index a1af95e271b..9f4c1d555df 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java @@ -104,6 +104,8 @@ public class JapaneseNumberFilter extends TokenFilter { private StringBuilder numeral; private int fallThroughTokens; + + private boolean exhausted = false; static { numerals = new char[0x10000]; @@ -149,7 +151,12 @@ public class JapaneseNumberFilter extends TokenFilter { return true; } + if (exhausted) { + return false; + } + if (!input.incrementToken()) { + exhausted = true; return false; } @@ -184,6 +191,9 @@ public class JapaneseNumberFilter extends TokenFilter { endOffset = offsetAttr.endOffset(); moreTokens = input.incrementToken(); + if (moreTokens == false) { + exhausted = true; + } if (posIncrAttr.getPositionIncrement() == 0) { // This token is a stacked/synonym token, capture number of tokens "under" this token, @@ -227,6 +237,7 @@ public class JapaneseNumberFilter extends TokenFilter { fallThroughTokens = 0; numeral = new StringBuilder(); state = null; + exhausted = false; } /** diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java new file mode 100644 index 00000000000..bae8ffc8f53 --- /dev/null +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestFactories.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ja; + + +import java.io.IOException; +import java.io.Reader; +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.CharFilterFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.analysis.util.TokenizerFactory; +import org.apache.lucene.util.AttributeFactory; +import org.apache.lucene.util.Version; + +/** + * Sanity check some things about all factories, + * we do our best to see if we can sanely initialize it with + * no parameters and smoke test it, etc. + */ +// TODO: this was copied from the analysis/common module ... find a better way to share it! + +// TODO: fix this to use CustomAnalyzer instead of its own FactoryAnalyzer +public class TestFactories extends BaseTokenStreamTestCase { + public void test() throws IOException { + for (String tokenizer : TokenizerFactory.availableTokenizers()) { + doTestTokenizer(tokenizer); + } + + for (String tokenFilter : TokenFilterFactory.availableTokenFilters()) { + doTestTokenFilter(tokenFilter); + } + + for (String charFilter : CharFilterFactory.availableCharFilters()) { + doTestCharFilter(charFilter); + } + } + + private void doTestTokenizer(String tokenizer) throws IOException { + Class factoryClazz = TokenizerFactory.lookupClass(tokenizer); + TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz); + if (factory != null) { + // we managed to fully create an instance. check a few more things: + + // if it implements MultiTermAware, sanity check its impl + if (factory instanceof MultiTermAwareComponent) { + AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent(); + assertNotNull(mtc); + // it's not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it + assertFalse(mtc instanceof CharFilterFactory); + } + + // beast it just a little, it shouldnt throw exceptions: + // (it should have thrown them in initialize) + Analyzer a = new FactoryAnalyzer(factory, null, null); + checkRandomData(random(), a, 20, 20, false, false); + a.close(); + } + } + + private void doTestTokenFilter(String tokenfilter) throws IOException { + Class factoryClazz = TokenFilterFactory.lookupClass(tokenfilter); + TokenFilterFactory factory = (TokenFilterFactory) initialize(factoryClazz); + if (factory != null) { + // we managed to fully create an instance. check a few more things: + + // if it implements MultiTermAware, sanity check its impl + if (factory instanceof MultiTermAwareComponent) { + AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent(); + assertNotNull(mtc); + // it's not ok to return a charfilter or tokenizer here, this makes no sense + assertTrue(mtc instanceof TokenFilterFactory); + } + + // beast it just a little, it shouldnt throw exceptions: + // (it should have thrown them in initialize) + Analyzer a = new FactoryAnalyzer(assertingTokenizer, factory, null); + checkRandomData(random(), a, 20, 20, false, false); + a.close(); + } + } + + private void doTestCharFilter(String charfilter) throws IOException { + Class factoryClazz = CharFilterFactory.lookupClass(charfilter); + CharFilterFactory factory = (CharFilterFactory) initialize(factoryClazz); + if (factory != null) { + // we managed to fully create an instance. check a few more things: + + // if it implements MultiTermAware, sanity check its impl + if (factory instanceof MultiTermAwareComponent) { + AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent(); + assertNotNull(mtc); + // it's not ok to return a tokenizer or tokenfilter here, this makes no sense + assertTrue(mtc instanceof CharFilterFactory); + } + + // beast it just a little, it shouldnt throw exceptions: + // (it should have thrown them in initialize) + Analyzer a = new FactoryAnalyzer(assertingTokenizer, null, factory); + checkRandomData(random(), a, 20, 20, false, false); + a.close(); + } + } + + /** tries to initialize a factory with no arguments */ + private AbstractAnalysisFactory initialize(Class factoryClazz) throws IOException { + Map args = new HashMap<>(); + args.put("luceneMatchVersion", Version.LATEST.toString()); + Constructor ctor; + try { + ctor = factoryClazz.getConstructor(Map.class); + } catch (Exception e) { + throw new RuntimeException("factory '" + factoryClazz + "' does not have a proper ctor!"); + } + + AbstractAnalysisFactory factory = null; + try { + factory = ctor.newInstance(args); + } catch (InstantiationException | IllegalAccessException e) { + throw new RuntimeException(e); + } catch (InvocationTargetException e) { + if (e.getCause() instanceof IllegalArgumentException) { + // it's ok if we dont provide the right parameters to throw this + return null; + } + } + + if (factory instanceof ResourceLoaderAware) { + try { + ((ResourceLoaderAware) factory).inform(new StringMockResourceLoader("")); + } catch (IOException ignored) { + // it's ok if the right files arent available or whatever to throw this + } catch (IllegalArgumentException ignored) { + // is this ok? I guess so + } + } + return factory; + } + + // some silly classes just so we can use checkRandomData + private TokenizerFactory assertingTokenizer = new TokenizerFactory(new HashMap()) { + @Override + public MockTokenizer create(AttributeFactory factory) { + return new MockTokenizer(factory); + } + }; + + private static class FactoryAnalyzer extends Analyzer { + final TokenizerFactory tokenizer; + final CharFilterFactory charFilter; + final TokenFilterFactory tokenfilter; + + FactoryAnalyzer(TokenizerFactory tokenizer, TokenFilterFactory tokenfilter, CharFilterFactory charFilter) { + assert tokenizer != null; + this.tokenizer = tokenizer; + this.charFilter = charFilter; + this.tokenfilter = tokenfilter; + } + + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tf = tokenizer.create(newAttributeFactory()); + if (tokenfilter != null) { + return new TokenStreamComponents(tf, tokenfilter.create(tf)); + } else { + return new TokenStreamComponents(tf); + } + } + + @Override + protected Reader initReader(String fieldName, Reader reader) { + if (charFilter != null) { + return charFilter.create(reader); + } else { + return reader; + } + } + } +}