From 0555fef799c0ce070f5daf946993eaa3fe750207 Mon Sep 17 00:00:00 2001
From: malpani
Date: Tue, 21 Jul 2020 08:11:11 -0700
Subject: [PATCH] Support ignore_keywords flag for word delimiter graph token filter (#59563)

This commit allows the word delimiter graph token filter to be customized to
skip processing of tokens tagged as keywords, through the `ignore_keywords`
flag that Lucene's WordDelimiterGraphFilter already exposes.

Fix for #59491
---
 .../word-delimiter-graph-tokenfilter.asciidoc |  8 ++++-
 .../WordDelimiterGraphTokenFilterFactory.java |  3 ++
 ...DelimiterGraphTokenFilterFactoryTests.java | 32 +++++++++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc
index 2fa9c41ad79..3858c5eeb0f 100644
--- a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc
@@ -270,6 +270,12 @@ If `true`, the filter includes tokens consisting of only alphabetical
 characters in the output. If `false`, the filter excludes these tokens from the
 output. Defaults to `true`.
 
+`ignore_keywords`::
+(Optional, boolean)
+If `true`, the filter skips tokens with
+a `keyword` attribute of `true`.
+Defaults to `false`.
+
 [[word-delimiter-graph-tokenfilter-preserve-original]]
 `preserve_original`::
 +
@@ -496,4 +502,4 @@ spans one in the token graph, making it invalid.
 
 image::images/analysis/token-graph-wd.svg[align="center"]
 
-====
\ No newline at end of file
+====
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java
index e64bb96bfcf..7dd8e88ccbc 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java
@@ -41,6 +41,7 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.IGNORE_KEYWORDS;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
@@ -93,6 +94,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
         // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
         flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
+        // If set, suppresses processing terms with KeywordAttribute#isKeyword()=true.
+        flags |= getFlag(IGNORE_KEYWORDS, settings, "ignore_keywords", false);
         // If not null is the set of tokens to protect from being delimited
         Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
         this.flags = flags;
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java
index ec61d614db9..a027781f022 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java
@@ -118,6 +118,38 @@ public class WordDelimiterGraphTokenFilterFactoryTests
             expectedIncr, expectedPosLen, null);
     }
 
+    public void testIgnoreKeywords() throws IOException {
+        // test with keywords but ignore_keywords is false (default behavior)
+        Settings settings = Settings.builder()
+            .put("index.analysis.filter.my_word_delimiter.type", type)
+            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+            .put("index.analysis.filter.my_keyword.type", "keyword_marker")
+            .put("index.analysis.filter.my_keyword.keywords", "PowerHungry")
+            .put("index.analysis.analyzer.my_analyzer.type", "custom")
+            .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
+            .put("index.analysis.analyzer.my_analyzer.filter", "my_keyword, my_word_delimiter")
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
+        String source = "PowerShot PowerHungry";
+        int[] expectedStartOffsets = new int[]{0, 5, 10, 15};
+        int[] expectedEndOffsets = new int[]{5, 9, 15, 21};
+        String[] expected = new String[]{"Power", "Shot", "Power", "Hungry"};
+        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        assertAnalyzesTo(analyzer, source, expected, expectedStartOffsets, expectedEndOffsets);
+
+        // test with keywords and ignore_keywords set to true
+        settings = Settings.builder().put(settings)
+            .put("index.analysis.filter.my_word_delimiter.ignore_keywords", "true")
+            .build();
+        analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
+        analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        expectedStartOffsets = new int[]{0, 5, 10};
+        expectedEndOffsets = new int[]{5, 9, 21};
+        expected = new String[]{"Power", "Shot", "PowerHungry"};
+        assertAnalyzesTo(analyzer, source, expected, expectedStartOffsets, expectedEndOffsets);
+    }
+
     public void testPreconfiguredFilter() throws IOException {
         // Before 7.3 we don't adjust offsets
         {
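
For context beyond the patch itself: the new setting simply toggles Lucene's IGNORE_KEYWORDS flag, which makes WordDelimiterGraphFilter pass through any token whose KeywordAttribute has been set upstream (for example by a keyword_marker / SetKeywordMarkerFilter). The following is a minimal, hypothetical sketch of that Lucene-level behaviour using plain Lucene 8.x analysis APIs; it is illustrative only, not Elasticsearch code and not part of this commit.

// Hypothetical standalone sketch (assumes Lucene 8.x on the classpath; class and
// variable names are illustrative, not taken from the patch).
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IgnoreKeywordsSketch {
    public static void main(String[] args) throws Exception {
        // Mark "PowerHungry" as a keyword, like the keyword_marker filter in the test above.
        CharArraySet keywords = new CharArraySet(Arrays.asList("PowerHungry"), false);

        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot PowerHungry"));

        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
            | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
            | WordDelimiterGraphFilter.IGNORE_KEYWORDS;   // the flag the new setting toggles
        TokenStream stream = new WordDelimiterGraphFilter(
            new SetKeywordMarkerFilter(tokenizer, keywords), flags, null);

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // Prints: Power, Shot, PowerHungry. Without IGNORE_KEYWORDS the keyword
            // token would also be split, yielding Power, Shot, Power, Hungry.
            System.out.println(term.toString());
        }
        stream.end();
        stream.close();
    }
}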