Support ignore_keywords flag for word delimiter graph token filter (#59563)

This commit allows customizing the word delimiter token filters to skip processing
tokens tagged as keywords via the `ignore_keywords` flag that Lucene's
WordDelimiterGraphFilter already exposes.

Fix for #59491
This commit is contained in:
malpani 2020-07-21 08:11:11 -07:00 committed by Alan Woodward
parent a0ad1a196b
commit 0555fef799
3 changed files with 42 additions and 1 deletions

View File

@ -270,6 +270,12 @@ If `true`, the filter includes tokens consisting of only alphabetical characters
in the output. If `false`, the filter excludes these tokens from the output.
Defaults to `true`.
`ignore_keywords`::
(Optional, boolean)
If `true`, the filter skips tokens with
a `keyword` attribute of `true`.
Defaults to `false`.
[[word-delimiter-graph-tokenfilter-preserve-original]]
`preserve_original`::
+
@ -496,4 +502,4 @@ spans one in the token graph, making it invalid.
image::images/analysis/token-graph-wd.svg[align="center"]
====
====

View File

@ -41,6 +41,7 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.IGNORE_KEYWORDS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
@ -93,6 +94,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
// If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
// If set, suppresses processing of terms with KeywordAttribute#isKeyword()=true
flags |= getFlag(IGNORE_KEYWORDS, settings, "ignore_keywords", false);
// If not null, this is the set of tokens to protect from being delimited
Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
this.flags = flags;

View File

@ -118,6 +118,38 @@ public class WordDelimiterGraphTokenFilterFactoryTests
expectedIncr, expectedPosLen, null);
}
public void testIgnoreKeywords() throws IOException {
    // First pass: ignore_keywords is left at its default (false), so tokens
    // marked by the keyword_marker filter are still split by the delimiter.
    Settings baseSettings = Settings.builder()
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
        .put("index.analysis.filter.my_keyword.type", "keyword_marker")
        .put("index.analysis.filter.my_keyword.keywords", "PowerHungry")
        .put("index.analysis.analyzer.my_analyzer.type", "custom")
        .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
        .put("index.analysis.analyzer.my_analyzer.filter", "my_keyword, my_word_delimiter")
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    ESTestCase.TestAnalysis testAnalysis =
        AnalysisTestsHelper.createTestAnalysisFromSettings(baseSettings, new CommonAnalysisPlugin());
    NamedAnalyzer defaultAnalyzer = testAnalysis.indexAnalyzers.get("my_analyzer");
    String input = "PowerShot PowerHungry";
    assertAnalyzesTo(defaultAnalyzer, input,
        new String[]{"Power", "Shot", "Power", "Hungry"},
        new int[]{0, 5, 10, 15},
        new int[]{5, 9, 15, 21});

    // Second pass: with ignore_keywords=true the keyword-marked token
    // "PowerHungry" passes through the word delimiter intact.
    Settings ignoreKeywordSettings = Settings.builder().put(baseSettings)
        .put("index.analysis.filter.my_word_delimiter.ignore_keywords", "true")
        .build();
    testAnalysis =
        AnalysisTestsHelper.createTestAnalysisFromSettings(ignoreKeywordSettings, new CommonAnalysisPlugin());
    NamedAnalyzer ignoringAnalyzer = testAnalysis.indexAnalyzers.get("my_analyzer");
    assertAnalyzesTo(ignoringAnalyzer, input,
        new String[]{"Power", "Shot", "PowerHungry"},
        new int[]{0, 5, 10},
        new int[]{5, 9, 21});
}
public void testPreconfiguredFilter() throws IOException {
// Before 7.3 we don't adjust offsets
{