LUCENE-8186: LowerCaseTokenizerFactory now lowercases text in multi-term queries.

This commit is contained in:
Adrien Grand 2018-05-28 16:20:49 +02:00
parent 78ca82e63a
commit 1971ef3109
3 changed files with 19 additions and 2 deletions

View File

@ -248,6 +248,9 @@ Bug Fixes
* LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.
(chengpohi via Jim Ferenczi)
* LUCENE-8186: LowerCaseTokenizerFactory now lowercases text in multi-term
queries. (Tim Allison via Adrien Grand)
Other
* LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)

View File

@ -162,6 +162,12 @@ public final class CustomAnalyzer extends Analyzer {
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = in;
// A tokenizer factory may return a token filter as its multi-term component when the
// tokenizer itself performs normalization, although this is arguably an abstraction violation.
if (tokenizer instanceof MultiTermAwareComponent) {
TokenFilterFactory filter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenizer).getMultiTermComponent();
result = filter.create(result);
}
for (TokenFilterFactory filter : tokenFilters) {
if (filter instanceof MultiTermAwareComponent) {
filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();

View File

@ -31,9 +31,9 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizerFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
@ -432,7 +432,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
// NOTE(review): two consecutive return statements cannot compile as Java; this looks like a
// diff hunk whose -/+ markers were stripped, so presumably the first return is the removed
// line and the second is its replacement -- confirm against the actual commit.
return new KeywordTokenizerFactory(getOriginalArgs());
return new DummyTokenFilterFactory(Collections.emptyMap());
}
}
@ -500,6 +500,14 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
.build();
assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c"));
}
/**
 * Verifies that {@code normalize} lowercases its input when the analyzer is built from
 * {@code LowerCaseTokenizerFactory} alone, i.e. when the tokenizer factory is the component
 * that supplies the normalization.
 */
public void testNormalizationWithLowerCaseTokenizer() throws IOException {
  final CustomAnalyzer lowerCasing = CustomAnalyzer.builder()
      .withTokenizer(LowerCaseTokenizerFactory.class, Collections.emptyMap())
      .build();
  // No token filters are configured, so any lowercasing must originate from the tokenizer factory.
  final BytesRef normalized = lowerCasing.normalize("dummy", "ABC");
  assertEquals(new BytesRef("abc"), normalized);
}
public void testConditions() throws IOException {
CustomAnalyzer analyzer = CustomAnalyzer.builder()