Support ignore_keywords flag for word delimiter graph token filter (#59563)
This commit allows customizing the word delimiter token filters to skip processing tokens tagged as keywords, through the `ignore_keywords` flag that Lucene's WordDelimiterGraphFilter already exposes. Fix for #59491
This commit is contained in:
parent
a0ad1a196b
commit
0555fef799
|
@ -270,6 +270,12 @@ If `true`, the filter includes tokens consisting of only alphabetical characters
|
|||
in the output. If `false`, the filter excludes these tokens from the output.
|
||||
Defaults to `true`.
|
||||
|
||||
`ignore_keywords`::
|
||||
(Optional, boolean)
|
||||
If `true`, the filter skips tokens with
|
||||
a `keyword` attribute of `true`.
|
||||
Defaults to `false`.
|
||||
|
||||
[[word-delimiter-graph-tokenfilter-preserve-original]]
|
||||
`preserve_original`::
|
||||
+
|
||||
|
@ -496,4 +502,4 @@ spans one in the token graph, making it invalid.
|
|||
|
||||
image::images/analysis/token-graph-wd.svg[align="center"]
|
||||
|
||||
====
|
||||
====
|
||||
|
|
|
@ -41,6 +41,7 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.
|
|||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.IGNORE_KEYWORDS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
|
||||
|
@ -93,6 +94,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
|
|||
// If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
|
||||
flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
|
||||
// If set, suppresses processing terms with KeywordAttribute#isKeyword()=true.
flags |= getFlag(IGNORE_KEYWORDS, settings, "ignore_keywords", false);
// If not null is the set of tokens to protect from being delimited
Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
|
||||
this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
|
||||
this.flags = flags;
|
||||
|
|
|
@ -118,6 +118,38 @@ public class WordDelimiterGraphTokenFilterFactoryTests
|
|||
expectedIncr, expectedPosLen, null);
|
||||
}
|
||||
|
||||
public void testIgnoreKeywords() throws IOException {
|
||||
//test with keywords but ignore is false (default behavior)
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
|
||||
.put("index.analysis.filter.my_keyword.type", "keyword_marker")
|
||||
.put("index.analysis.filter.my_keyword.keywords", "PowerHungry")
|
||||
.put("index.analysis.analyzer.my_analyzer.type", "custom")
|
||||
.put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
|
||||
.put("index.analysis.analyzer.my_analyzer.filter", "my_keyword, my_word_delimiter")
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
||||
String source = "PowerShot PowerHungry";
|
||||
int[] expectedStartOffsets = new int[]{0, 5, 10, 15};
|
||||
int[] expectedEndOffsets = new int[]{5, 9, 15, 21};
|
||||
String[] expected = new String[]{"Power", "Shot", "Power", "Hungry"};
|
||||
NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
|
||||
assertAnalyzesTo(analyzer, source, expected, expectedStartOffsets, expectedEndOffsets);
|
||||
|
||||
//test with keywords but ignore_keywords is set as true
|
||||
settings = Settings.builder().put(settings)
|
||||
.put("index.analysis.filter.my_word_delimiter.ignore_keywords", "true")
|
||||
.build();
|
||||
analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
||||
analyzer = analysis.indexAnalyzers.get("my_analyzer");
|
||||
expectedStartOffsets = new int[]{0, 5, 10};
|
||||
expectedEndOffsets = new int[]{5, 9, 21};
|
||||
expected = new String[]{"Power", "Shot", "PowerHungry"};
|
||||
assertAnalyzesTo(analyzer, source, expected, expectedStartOffsets, expectedEndOffsets);
|
||||
}
|
||||
|
||||
public void testPreconfiguredFilter() throws IOException {
|
||||
// Before 7.3 we don't adjust offsets
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue