Allow TrimFilter to be used in custom normalizers (#27758)

AnalysisFactoryTestCase checks that the multi-term awareness of each ES custom token filter matches that of the underlying Lucene factory. For the trim filter this won't be the case until LUCENE-8093 is released in Lucene 7.3, so we add a temporary exclusion. Closes #27310
2025-03-09 14:34:43 +00:00 · 2017-12-18 14:27:03 +00:00 · 2017-12-18 14:27:03 +00:00 · af3f63616b
commit af3f63616b
parent 76771242e8
3 changed files with 55 additions and 1 deletions
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@ -246,7 +246,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
        filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
        // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
        filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
-        filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
        filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/TrimTokenFilterTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/TrimTokenFilterTests.java
@ -0,0 +1,48 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+
+import java.io.IOException;
+
+public class TrimTokenFilterTests extends ESTokenStreamTestCase { // covers the "trim" filter used inside a custom normalizer
+
+    public void testNormalizer() throws IOException { // builds a trim-only normalizer and checks its output
+        Settings settings = Settings.builder()
+            .putList("index.analysis.normalizer.my_normalizer.filter", "trim") // "my_normalizer" has "trim" as its only filter
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) // home path points at a fresh temp dir
+            .build();
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
+        assertNull(analysis.indexAnalyzers.get("my_normalizer")); // the name must not resolve through the analyzer lookup
+        NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer"); // it resolves through the normalizer lookup
+        assertNotNull(normalizer);
+        assertEquals("my_normalizer", normalizer.name());
+        assertTokenStreamContents(normalizer.tokenStream("foo", "  bar  "), new String[] {"bar"}); // padded input yields the single token "bar"
+        assertEquals(new BytesRef("bar"), normalizer.normalize("foo", "  bar  ")); // normalize() returns the bytes of "bar" for the same input
+    }
+
+}
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@ -22,6 +22,7 @@ package org.elasticsearch.indices.analysis;
 import org.apache.lucene.analysis.util.CharFilterFactory;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.elasticsearch.Version;
 import org.elasticsearch.common.collect.MapBuilder;
 import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
 import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
@ -462,6 +463,11 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {

        Set<Object> classesThatShouldNotHaveMultiTermSupport = new HashSet<>(actual);
        classesThatShouldNotHaveMultiTermSupport.removeAll(expected);
+        classesThatShouldNotHaveMultiTermSupport.remove("token filter [trim]");
+        if (Version.CURRENT.luceneVersion.onOrAfter(org.apache.lucene.util.Version.fromBits(7, 3, 0))) {
+            // TODO: remove the above exclusion when we move to lucene 7.3
+            assert false;
+        }
        assertTrue("Pre-built components should not have multi-term support: " + classesThatShouldNotHaveMultiTermSupport,
                classesThatShouldNotHaveMultiTermSupport.isEmpty());
    }