LUCENE-7536: ASCIIFoldingFilterFactory.getMultiTermComponent can emit two tokens.

This commit is contained in:
Adrien Grand 2016-11-18 10:06:16 +01:00
parent 6d1962a902
commit 0e50101ee3
3 changed files with 72 additions and 2 deletions

View File

@ -25,6 +25,9 @@ Bug Fixes
* LUCENE-7533: Classic query parser: disallow autoGeneratePhraseQueries=true * LUCENE-7533: Classic query parser: disallow autoGeneratePhraseQueries=true
when splitOnWhitespace=false (and vice-versa). (Steve Rowe) when splitOnWhitespace=false (and vice-versa). (Steve Rowe)
* LUCENE-7536: ASCIIFoldingFilterFactory used to return an illegal multi-term
component when preserveOriginal was set to true. (Adrien Grand)
Improvements Improvements
* LUCENE-7532: Add back lost codec file format documentation * LUCENE-7532: Add back lost codec file format documentation

View File

@ -17,6 +17,7 @@
package org.apache.lucene.analysis.miscellaneous; package org.apache.lucene.analysis.miscellaneous;
import java.util.HashMap;
import java.util.Map; import java.util.Map;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory; import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
@ -36,12 +37,14 @@ import org.apache.lucene.analysis.TokenStream;
* &lt;/fieldType&gt;</pre> * &lt;/fieldType&gt;</pre>
*/ */
public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
private static final String PRESERVE_ORIGINAL = "preserveOriginal";
private final boolean preserveOriginal; private final boolean preserveOriginal;
/** Creates a new ASCIIFoldingFilterFactory */ /** Creates a new ASCIIFoldingFilterFactory */
public ASCIIFoldingFilterFactory(Map<String,String> args) { public ASCIIFoldingFilterFactory(Map<String,String> args) {
super(args); super(args);
preserveOriginal = getBoolean(args, "preserveOriginal", false); preserveOriginal = getBoolean(args, PRESERVE_ORIGINAL, false);
if (!args.isEmpty()) { if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args); throw new IllegalArgumentException("Unknown parameters: " + args);
} }
@ -54,7 +57,17 @@ public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements Mul
@Override @Override
public AbstractAnalysisFactory getMultiTermComponent() { public AbstractAnalysisFactory getMultiTermComponent() {
if (preserveOriginal) {
// The main use-case for using preserveOriginal is to match regardless of
// case but to give better scores to exact matches. Since most multi-term
// queries return constant scores anyway, the multi-term component only
// emits the folded token
Map<String, String> args = new HashMap<>(getOriginalArgs());
args.remove(PRESERVE_ORIGINAL);
return new ASCIIFoldingFilterFactory(args);
} else {
return this; return this;
} }
} }
}

View File

@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
public class TestAsciiFoldingFilterFactory extends BaseTokenStreamFactoryTestCase {
public void testMultiTermAnalysis() throws IOException {
TokenFilterFactory factory = new ASCIIFoldingFilterFactory(Collections.emptyMap());
TokenStream stream = new CannedTokenStream(new Token("Été", 0, 3));
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "Ete" });
factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
stream = new CannedTokenStream(new Token("Été", 0, 3));
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "Ete" });
factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
stream = new CannedTokenStream(new Token("Été", 0, 3));
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "Ete", "Été" });
factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
stream = new CannedTokenStream(new Token("Été", 0, 3));
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "Ete" });
}
}