mirror of https://github.com/apache/lucene.git
LUCENE-7536: ASCIIFoldingFilterFactory.getMultiTermComponent can emit two tokens.
This commit is contained in:
parent
6d1962a902
commit
0e50101ee3
|
@ -25,6 +25,9 @@ Bug Fixes
|
||||||
* LUCENE-7533: Classic query parser: disallow autoGeneratePhraseQueries=true
|
* LUCENE-7533: Classic query parser: disallow autoGeneratePhraseQueries=true
|
||||||
when splitOnWhitespace=false (and vice-versa). (Steve Rowe)
|
when splitOnWhitespace=false (and vice-versa). (Steve Rowe)
|
||||||
|
|
||||||
|
* LUCENE-7536: ASCIIFoldingFilterFactory used to return an illegal multi-term
|
||||||
|
component when preserveOriginal was set to true. (Adrien Grand)
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
|
|
||||||
* LUCENE-7532: Add back lost codec file format documentation
|
* LUCENE-7532: Add back lost codec file format documentation
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
package org.apache.lucene.analysis.miscellaneous;
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||||
|
@ -36,12 +37,14 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*/
|
*/
|
||||||
public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||||
|
private static final String PRESERVE_ORIGINAL = "preserveOriginal";
|
||||||
|
|
||||||
private final boolean preserveOriginal;
|
private final boolean preserveOriginal;
|
||||||
|
|
||||||
/** Creates a new ASCIIFoldingFilterFactory */
|
/** Creates a new ASCIIFoldingFilterFactory */
|
||||||
public ASCIIFoldingFilterFactory(Map<String,String> args) {
|
public ASCIIFoldingFilterFactory(Map<String,String> args) {
|
||||||
super(args);
|
super(args);
|
||||||
preserveOriginal = getBoolean(args, "preserveOriginal", false);
|
preserveOriginal = getBoolean(args, PRESERVE_ORIGINAL, false);
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
}
|
}
|
||||||
|
@ -54,7 +57,17 @@ public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements Mul
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||||
|
if (preserveOriginal) {
|
||||||
|
// The main use-case for using preserveOriginal is to match regardless of
|
||||||
|
// case but to give better scores to exact matches. Since most multi-term
|
||||||
|
// queries return constant scores anyway, the multi-term component only
|
||||||
|
// emits the folded token
|
||||||
|
Map<String, String> args = new HashMap<>(getOriginalArgs());
|
||||||
|
args.remove(PRESERVE_ORIGINAL);
|
||||||
|
return new ASCIIFoldingFilterFactory(args);
|
||||||
|
} else {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,54 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CannedTokenStream;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||||
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
|
|
||||||
|
public class TestAsciiFoldingFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||||
|
|
||||||
|
public void testMultiTermAnalysis() throws IOException {
|
||||||
|
TokenFilterFactory factory = new ASCIIFoldingFilterFactory(Collections.emptyMap());
|
||||||
|
TokenStream stream = new CannedTokenStream(new Token("Été", 0, 3));
|
||||||
|
stream = factory.create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "Ete" });
|
||||||
|
|
||||||
|
factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||||
|
stream = new CannedTokenStream(new Token("Été", 0, 3));
|
||||||
|
stream = factory.create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "Ete" });
|
||||||
|
|
||||||
|
factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
|
||||||
|
stream = new CannedTokenStream(new Token("Été", 0, 3));
|
||||||
|
stream = factory.create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "Ete", "Été" });
|
||||||
|
|
||||||
|
factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||||
|
stream = new CannedTokenStream(new Token("Été", 0, 3));
|
||||||
|
stream = factory.create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "Ete" });
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue