Support ignore_keywords flag for word delimiter graph token filter (#59563)
This commit allows customizing the word delimiter token filters to skip processing tokens tagged as keywords, through the `ignore_keywords` flag that Lucene's WordDelimiterGraphFilter already exposes. Fix for #59491
This commit is contained in:
parent
a0ad1a196b
commit
0555fef799
|
@ -270,6 +270,12 @@ If `true`, the filter includes tokens consisting of only alphabetical characters
|
|||
in the output. If `false`, the filter excludes these tokens from the output.
|
||||
Defaults to `true`.
|
||||
|
||||
`ignore_keywords`::
|
||||
(Optional, boolean)
|
||||
If `true`, the filter skips tokens with
|
||||
a `keyword` attribute of `true`.
|
||||
Defaults to `false`.
|
||||
|
||||
[[word-delimiter-graph-tokenfilter-preserve-original]]
|
||||
`preserve_original`::
|
||||
+
|
||||
|
@ -496,4 +502,4 @@ spans one in the token graph, making it invalid.
|
|||
|
||||
image::images/analysis/token-graph-wd.svg[align="center"]
|
||||
|
||||
====
|
||||
====
|
||||
|
|
|
@ -41,6 +41,7 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.
|
|||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.IGNORE_KEYWORDS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
|
||||
|
@ -93,6 +94,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
|
|||
// If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
|
||||
flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
|
||||
// If set, suppresses processing terms with KeywordAttribute#isKeyword()=true.
flags |= getFlag(IGNORE_KEYWORDS, settings, "ignore_keywords", false);
// If not null is the set of tokens to protect from being delimited
Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
|
||||
this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
|
||||
this.flags = flags;
|
||||
|
|
|
@ -118,6 +118,38 @@ public class WordDelimiterGraphTokenFilterFactoryTests
|
|||
expectedIncr, expectedPosLen, null);
|
||||
}
|
||||
|
||||
public void testIgnoreKeywords() throws IOException {
|
||||
//test with keywords but ignore is false (default behavior)
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
|
||||
.put("index.analysis.filter.my_keyword.type", "keyword_marker")
|
||||
.put("index.analysis.filter.my_keyword.keywords", "PowerHungry")
|
||||
.put("index.analysis.analyzer.my_analyzer.type", "custom")
|
||||
.put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
|
||||
.put("index.analysis.analyzer.my_analyzer.filter", "my_keyword, my_word_delimiter")
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
||||
String source = "PowerShot PowerHungry";
|
||||
int[] expectedStartOffsets = new int[]{0, 5, 10, 15};
|
||||
int[] expectedEndOffsets = new int[]{5, 9, 15, 21};
|
||||
String[] expected = new String[]{"Power", "Shot", "Power", "Hungry"};
|
||||
NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
|
||||
assertAnalyzesTo(analyzer, source, expected, expectedStartOffsets, expectedEndOffsets);
|
||||
|
||||
//test with keywords but ignore_keywords is set as true
|
||||
settings = Settings.builder().put(settings)
|
||||
.put("index.analysis.filter.my_word_delimiter.ignore_keywords", "true")
|
||||
.build();
|
||||
analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
||||
analyzer = analysis.indexAnalyzers.get("my_analyzer");
|
||||
expectedStartOffsets = new int[]{0, 5, 10};
|
||||
expectedEndOffsets = new int[]{5, 9, 21};
|
||||
expected = new String[]{"Power", "Shot", "PowerHungry"};
|
||||
assertAnalyzesTo(analyzer, source, expected, expectedStartOffsets, expectedEndOffsets);
|
||||
}
|
||||
|
||||
public void testPreconfiguredFilter() throws IOException {
|
||||
// Before 7.3 we don't adjust offsets
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue