diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 4a99bce9244..8f3e37f4cdd 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -25,6 +25,9 @@ Bug Fixes * LUCENE-7533: Classic query parser: disallow autoGeneratePhraseQueries=true when splitOnWhitespace=false (and vice-versa). (Steve Rowe) +* LUCENE-7536: ASCIIFoldingFilterFactory used to return an illegal multi-term + component when preserveOriginal was set to true. (Adrien Grand) + Improvements * LUCENE-7532: Add back lost codec file format documentation diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java index 60dddff6bd2..4e64abe3c22 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java @@ -17,6 +17,7 @@ package org.apache.lucene.analysis.miscellaneous; +import java.util.HashMap; import java.util.Map; import org.apache.lucene.analysis.util.AbstractAnalysisFactory; @@ -36,12 +37,14 @@ import org.apache.lucene.analysis.TokenStream; * </fieldType> */ public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + private static final String PRESERVE_ORIGINAL = "preserveOriginal"; + private final boolean preserveOriginal; /** Creates a new ASCIIFoldingFilterFactory */ public ASCIIFoldingFilterFactory(Map args) { super(args); - preserveOriginal = getBoolean(args, "preserveOriginal", false); + preserveOriginal = getBoolean(args, PRESERVE_ORIGINAL, false); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -54,7 +57,17 @@ public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements Mul @Override public AbstractAnalysisFactory getMultiTermComponent() { - return this; + if (preserveOriginal) { + // The main use-case for using preserveOriginal is to match regardless of + // case but to give better scores to exact matches. Since most multi-term + // queries return constant scores anyway, the multi-term component only + // emits the folded token + Map args = new HashMap<>(getOriginalArgs()); + args.remove(PRESERVE_ORIGINAL); + return new ASCIIFoldingFilterFactory(args); + } else { + return this; + } } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestAsciiFoldingFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestAsciiFoldingFilterFactory.java new file mode 100644 index 00000000000..87d87607c2d --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestAsciiFoldingFilterFactory.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.miscellaneous; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; + +import org.apache.lucene.analysis.CannedTokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +public class TestAsciiFoldingFilterFactory extends BaseTokenStreamFactoryTestCase { + + public void testMultiTermAnalysis() throws IOException { + TokenFilterFactory factory = new ASCIIFoldingFilterFactory(Collections.emptyMap()); + TokenStream stream = new CannedTokenStream(new Token("Été", 0, 3)); + stream = factory.create(stream); + assertTokenStreamContents(stream, new String[] { "Ete" }); + + factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent(); + stream = new CannedTokenStream(new Token("Été", 0, 3)); + stream = factory.create(stream); + assertTokenStreamContents(stream, new String[] { "Ete" }); + + factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true"))); + stream = new CannedTokenStream(new Token("Été", 0, 3)); + stream = factory.create(stream); + assertTokenStreamContents(stream, new String[] { "Ete", "Été" }); + + factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent(); + stream = new CannedTokenStream(new Token("Été", 0, 3)); + stream = factory.create(stream); + assertTokenStreamContents(stream, new String[] { "Ete" }); + } + +}