mirror of https://github.com/apache/lucene.git
LUCENE-7476: JapaneseNumberFilter should not invoke incrementToken on its input after it's exhausted
This commit is contained in:
parent
9ada6ecb5e
commit
257ea3423f
|
@ -67,6 +67,9 @@ Bug Fixes
|
||||||
* LUCENE-7484: FastVectorHighlighter failed to highlight SynonymQuery
|
* LUCENE-7484: FastVectorHighlighter failed to highlight SynonymQuery
|
||||||
(Jim Ferenczi via Mike McCandless)
|
(Jim Ferenczi via Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-7476: JapaneseNumberFilter should not invoke incrementToken
|
||||||
|
on its input after it's exhausted (Andy Hind via Mike McCandless)
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
|
|
||||||
* LUCENE-7439: FuzzyQuery now matches all terms within the specified
|
* LUCENE-7439: FuzzyQuery now matches all terms within the specified
|
||||||
|
|
|
@ -105,6 +105,8 @@ public class JapaneseNumberFilter extends TokenFilter {
|
||||||
|
|
||||||
private int fallThroughTokens;
|
private int fallThroughTokens;
|
||||||
|
|
||||||
|
private boolean exhausted = false;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
numerals = new char[0x10000];
|
numerals = new char[0x10000];
|
||||||
for (int i = 0; i < numerals.length; i++) {
|
for (int i = 0; i < numerals.length; i++) {
|
||||||
|
@ -149,7 +151,12 @@ public class JapaneseNumberFilter extends TokenFilter {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (exhausted) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
if (!input.incrementToken()) {
|
if (!input.incrementToken()) {
|
||||||
|
exhausted = true;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -184,6 +191,9 @@ public class JapaneseNumberFilter extends TokenFilter {
|
||||||
|
|
||||||
endOffset = offsetAttr.endOffset();
|
endOffset = offsetAttr.endOffset();
|
||||||
moreTokens = input.incrementToken();
|
moreTokens = input.incrementToken();
|
||||||
|
if (moreTokens == false) {
|
||||||
|
exhausted = true;
|
||||||
|
}
|
||||||
|
|
||||||
if (posIncrAttr.getPositionIncrement() == 0) {
|
if (posIncrAttr.getPositionIncrement() == 0) {
|
||||||
// This token is a stacked/synonym token, capture number of tokens "under" this token,
|
// This token is a stacked/synonym token, capture number of tokens "under" this token,
|
||||||
|
@ -227,6 +237,7 @@ public class JapaneseNumberFilter extends TokenFilter {
|
||||||
fallThroughTokens = 0;
|
fallThroughTokens = 0;
|
||||||
numeral = new StringBuilder();
|
numeral = new StringBuilder();
|
||||||
state = null;
|
state = null;
|
||||||
|
exhausted = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -0,0 +1,203 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.ja;
|
||||||
|
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.lang.reflect.Constructor;
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||||
|
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||||
|
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||||
|
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||||
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
|
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||||
|
import org.apache.lucene.util.AttributeFactory;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sanity check some things about all factories,
|
||||||
|
* we do our best to see if we can sanely initialize it with
|
||||||
|
* no parameters and smoke test it, etc.
|
||||||
|
*/
|
||||||
|
// TODO: this was copied from the analysis/common module ... find a better way to share it!
|
||||||
|
|
||||||
|
// TODO: fix this to use CustomAnalyzer instead of its own FactoryAnalyzer
|
||||||
|
public class TestFactories extends BaseTokenStreamTestCase {
|
||||||
|
public void test() throws IOException {
|
||||||
|
for (String tokenizer : TokenizerFactory.availableTokenizers()) {
|
||||||
|
doTestTokenizer(tokenizer);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String tokenFilter : TokenFilterFactory.availableTokenFilters()) {
|
||||||
|
doTestTokenFilter(tokenFilter);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String charFilter : CharFilterFactory.availableCharFilters()) {
|
||||||
|
doTestCharFilter(charFilter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doTestTokenizer(String tokenizer) throws IOException {
|
||||||
|
Class<? extends TokenizerFactory> factoryClazz = TokenizerFactory.lookupClass(tokenizer);
|
||||||
|
TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
|
||||||
|
if (factory != null) {
|
||||||
|
// we managed to fully create an instance. check a few more things:
|
||||||
|
|
||||||
|
// if it implements MultiTermAware, sanity check its impl
|
||||||
|
if (factory instanceof MultiTermAwareComponent) {
|
||||||
|
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||||
|
assertNotNull(mtc);
|
||||||
|
// it's not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
|
||||||
|
assertFalse(mtc instanceof CharFilterFactory);
|
||||||
|
}
|
||||||
|
|
||||||
|
// beast it just a little, it shouldnt throw exceptions:
|
||||||
|
// (it should have thrown them in initialize)
|
||||||
|
Analyzer a = new FactoryAnalyzer(factory, null, null);
|
||||||
|
checkRandomData(random(), a, 20, 20, false, false);
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doTestTokenFilter(String tokenfilter) throws IOException {
|
||||||
|
Class<? extends TokenFilterFactory> factoryClazz = TokenFilterFactory.lookupClass(tokenfilter);
|
||||||
|
TokenFilterFactory factory = (TokenFilterFactory) initialize(factoryClazz);
|
||||||
|
if (factory != null) {
|
||||||
|
// we managed to fully create an instance. check a few more things:
|
||||||
|
|
||||||
|
// if it implements MultiTermAware, sanity check its impl
|
||||||
|
if (factory instanceof MultiTermAwareComponent) {
|
||||||
|
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||||
|
assertNotNull(mtc);
|
||||||
|
// it's not ok to return a charfilter or tokenizer here, this makes no sense
|
||||||
|
assertTrue(mtc instanceof TokenFilterFactory);
|
||||||
|
}
|
||||||
|
|
||||||
|
// beast it just a little, it shouldnt throw exceptions:
|
||||||
|
// (it should have thrown them in initialize)
|
||||||
|
Analyzer a = new FactoryAnalyzer(assertingTokenizer, factory, null);
|
||||||
|
checkRandomData(random(), a, 20, 20, false, false);
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doTestCharFilter(String charfilter) throws IOException {
|
||||||
|
Class<? extends CharFilterFactory> factoryClazz = CharFilterFactory.lookupClass(charfilter);
|
||||||
|
CharFilterFactory factory = (CharFilterFactory) initialize(factoryClazz);
|
||||||
|
if (factory != null) {
|
||||||
|
// we managed to fully create an instance. check a few more things:
|
||||||
|
|
||||||
|
// if it implements MultiTermAware, sanity check its impl
|
||||||
|
if (factory instanceof MultiTermAwareComponent) {
|
||||||
|
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||||
|
assertNotNull(mtc);
|
||||||
|
// it's not ok to return a tokenizer or tokenfilter here, this makes no sense
|
||||||
|
assertTrue(mtc instanceof CharFilterFactory);
|
||||||
|
}
|
||||||
|
|
||||||
|
// beast it just a little, it shouldnt throw exceptions:
|
||||||
|
// (it should have thrown them in initialize)
|
||||||
|
Analyzer a = new FactoryAnalyzer(assertingTokenizer, null, factory);
|
||||||
|
checkRandomData(random(), a, 20, 20, false, false);
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** tries to initialize a factory with no arguments */
|
||||||
|
private AbstractAnalysisFactory initialize(Class<? extends AbstractAnalysisFactory> factoryClazz) throws IOException {
|
||||||
|
Map<String,String> args = new HashMap<>();
|
||||||
|
args.put("luceneMatchVersion", Version.LATEST.toString());
|
||||||
|
Constructor<? extends AbstractAnalysisFactory> ctor;
|
||||||
|
try {
|
||||||
|
ctor = factoryClazz.getConstructor(Map.class);
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException("factory '" + factoryClazz + "' does not have a proper ctor!");
|
||||||
|
}
|
||||||
|
|
||||||
|
AbstractAnalysisFactory factory = null;
|
||||||
|
try {
|
||||||
|
factory = ctor.newInstance(args);
|
||||||
|
} catch (InstantiationException | IllegalAccessException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
} catch (InvocationTargetException e) {
|
||||||
|
if (e.getCause() instanceof IllegalArgumentException) {
|
||||||
|
// it's ok if we dont provide the right parameters to throw this
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (factory instanceof ResourceLoaderAware) {
|
||||||
|
try {
|
||||||
|
((ResourceLoaderAware) factory).inform(new StringMockResourceLoader(""));
|
||||||
|
} catch (IOException ignored) {
|
||||||
|
// it's ok if the right files arent available or whatever to throw this
|
||||||
|
} catch (IllegalArgumentException ignored) {
|
||||||
|
// is this ok? I guess so
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return factory;
|
||||||
|
}
|
||||||
|
|
||||||
|
// some silly classes just so we can use checkRandomData
|
||||||
|
private TokenizerFactory assertingTokenizer = new TokenizerFactory(new HashMap<String,String>()) {
|
||||||
|
@Override
|
||||||
|
public MockTokenizer create(AttributeFactory factory) {
|
||||||
|
return new MockTokenizer(factory);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
private static class FactoryAnalyzer extends Analyzer {
|
||||||
|
final TokenizerFactory tokenizer;
|
||||||
|
final CharFilterFactory charFilter;
|
||||||
|
final TokenFilterFactory tokenfilter;
|
||||||
|
|
||||||
|
FactoryAnalyzer(TokenizerFactory tokenizer, TokenFilterFactory tokenfilter, CharFilterFactory charFilter) {
|
||||||
|
assert tokenizer != null;
|
||||||
|
this.tokenizer = tokenizer;
|
||||||
|
this.charFilter = charFilter;
|
||||||
|
this.tokenfilter = tokenfilter;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
Tokenizer tf = tokenizer.create(newAttributeFactory());
|
||||||
|
if (tokenfilter != null) {
|
||||||
|
return new TokenStreamComponents(tf, tokenfilter.create(tf));
|
||||||
|
} else {
|
||||||
|
return new TokenStreamComponents(tf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Reader initReader(String fieldName, Reader reader) {
|
||||||
|
if (charFilter != null) {
|
||||||
|
return charFilter.create(reader);
|
||||||
|
} else {
|
||||||
|
return reader;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue