Allow TrimFilter to be used in custom normalizers (#27758)

AnalysisFactoryTestCase checks that the ES custom token filter multi-term
awareness matches the underlying lucene factory.  For the trim filter this
won't be the case until LUCENE-8093 is released in 7.3, so we add a
temporary exclusion

Closes #27310
This commit is contained in:
Alan Woodward 2017-12-18 14:27:03 +00:00 committed by GitHub
parent 76771242e8
commit af3f63616b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 55 additions and 1 deletions

View File

@ -246,7 +246,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
// The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));

View File

@ -0,0 +1,48 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
import java.io.IOException;
public class TrimTokenFilterTests extends ESTokenStreamTestCase {
public void testNormalizer() throws IOException {
Settings settings = Settings.builder()
.putList("index.analysis.normalizer.my_normalizer.filter", "trim")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
assertNull(analysis.indexAnalyzers.get("my_normalizer"));
NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer");
assertNotNull(normalizer);
assertEquals("my_normalizer", normalizer.name());
assertTokenStreamContents(normalizer.tokenStream("foo", " bar "), new String[] {"bar"});
assertEquals(new BytesRef("bar"), normalizer.normalize("foo", " bar "));
}
}

View File

@ -22,6 +22,7 @@ package org.elasticsearch.indices.analysis;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.elasticsearch.Version;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
@ -462,6 +463,11 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
Set<Object> classesThatShouldNotHaveMultiTermSupport = new HashSet<>(actual);
classesThatShouldNotHaveMultiTermSupport.removeAll(expected);
classesThatShouldNotHaveMultiTermSupport.remove("token filter [trim]");
if (Version.CURRENT.luceneVersion.onOrAfter(org.apache.lucene.util.Version.fromBits(7, 3, 0))) {
// TODO: remove the above exclusion when we move to lucene 7.3
assert false;
}
assertTrue("Pre-built components should not have multi-term support: " + classesThatShouldNotHaveMultiTermSupport,
classesThatShouldNotHaveMultiTermSupport.isEmpty());
}