mirror of https://github.com/apache/lucene.git
LUCENE-7536: ASCIIFoldingFilterFactory.getMultiTermComponent can emit two tokens.
This commit is contained in:
parent
6d1962a902
commit
0e50101ee3
|
@ -25,6 +25,9 @@ Bug Fixes
|
|||
* LUCENE-7533: Classic query parser: disallow autoGeneratePhraseQueries=true
|
||||
when splitOnWhitespace=false (and vice-versa). (Steve Rowe)
|
||||
|
||||
* LUCENE-7536: ASCIIFoldingFilterFactory used to return an illegal multi-term
|
||||
component when preserveOriginal was set to true. (Adrien Grand)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-7532: Add back lost codec file format documentation
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
|
@ -36,12 +37,14 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
* </fieldType></pre>
|
||||
*/
|
||||
public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
private static final String PRESERVE_ORIGINAL = "preserveOriginal";
|
||||
|
||||
private final boolean preserveOriginal;
|
||||
|
||||
/** Creates a new ASCIIFoldingFilterFactory */
|
||||
public ASCIIFoldingFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
preserveOriginal = getBoolean(args, "preserveOriginal", false);
|
||||
preserveOriginal = getBoolean(args, PRESERVE_ORIGINAL, false);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
|
@ -54,7 +57,17 @@ public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements Mul
|
|||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
if (preserveOriginal) {
|
||||
// The main use-case for using preserveOriginal is to match regardless of
|
||||
// case but to give better scores to exact matches. Since most multi-term
|
||||
// queries return constant scores anyway, the multi-term component only
|
||||
// emits the folded token
|
||||
Map<String, String> args = new HashMap<>(getOriginalArgs());
|
||||
args.remove(PRESERVE_ORIGINAL);
|
||||
return new ASCIIFoldingFilterFactory(args);
|
||||
} else {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,54 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
|
||||
import org.apache.lucene.analysis.CannedTokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
public class TestAsciiFoldingFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
|
||||
public void testMultiTermAnalysis() throws IOException {
|
||||
TokenFilterFactory factory = new ASCIIFoldingFilterFactory(Collections.emptyMap());
|
||||
TokenStream stream = new CannedTokenStream(new Token("Été", 0, 3));
|
||||
stream = factory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "Ete" });
|
||||
|
||||
factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||
stream = new CannedTokenStream(new Token("Été", 0, 3));
|
||||
stream = factory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "Ete" });
|
||||
|
||||
factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
|
||||
stream = new CannedTokenStream(new Token("Été", 0, 3));
|
||||
stream = factory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "Ete", "Été" });
|
||||
|
||||
factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||
stream = new CannedTokenStream(new Token("Été", 0, 3));
|
||||
stream = factory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "Ete" });
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue