Mirror of https://github.com/honeymoose/OpenSearch.git (synced 2025-02-17 10:25:15 +00:00)
Adds pattern keyword marker filter support (#23600)
This commit adds support for Lucene's pattern-based keyword marker filter. Previously, the keyword marker filter in Elasticsearch only supported specifying a set of keywords or a path to a file containing the keywords. This commit also exposes the regular-expression-based keyword marker filter available in Lucene, so that any token matching the pattern specified by the `keywords_pattern` setting is excluded from being stemmed by any stemming filters.

Closes #4877
Parent: cced2cea5c
Commit: 2120086d82
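For context, here is a minimal plain-Lucene sketch (not part of this commit) of the mechanism the commit message describes: a PatternKeywordMarkerFilter flags tokens matching a regular expression as keywords, and a downstream stemmer leaves them untouched. The class name, the direct filter composition, and the use of PorterStemFilter (standing in for the `porter_stem` filter used in the tests below) are illustrative assumptions; in Elasticsearch the chain is built from index settings by the factory shown in the diff.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.io.StringReader;
import java.util.regex.Pattern;

// Hypothetical class name, for illustration only.
public class PatternKeywordMarkerSketch {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("running jogging sleeping"));

        // Any token matching the pattern is flagged via KeywordAttribute, so the
        // stemmer that follows skips it; everything else is stemmed as usual.
        TokenStream stream = new PatternKeywordMarkerFilter(tokenizer, Pattern.compile("run[a-z]ing"));
        stream = new PorterStemFilter(stream);

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString());  // prints: running, jog, sleep
        }
        stream.end();
        stream.close();
    }
}

The `keywords`/`keywords_path` variant works the same way, except a SetKeywordMarkerFilter consults a CharArraySet of protected words instead of a pattern.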
@@ -21,32 +21,70 @@ package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

import java.util.Set;
import java.util.regex.Pattern;

/**
 * A factory for creating keyword marker token filters that prevent tokens from
 * being modified by stemmers. Two types of keyword marker filters are available:
 * the {@link SetKeywordMarkerFilter} and the {@link PatternKeywordMarkerFilter}.
 *
 * The {@link SetKeywordMarkerFilter} uses a set of keywords to denote which tokens
 * should be excluded from stemming. This filter is created if the settings include
 * {@code keywords}, which contains the list of keywords, or {@code keywords_path},
 * which contains a path to a file in the config directory with the keywords.
 *
 * The {@link PatternKeywordMarkerFilter} uses a regular expression pattern to match
 * against tokens that should be excluded from stemming. This filter is created if
 * the settings include {@code keywords_pattern}, which contains the regular expression
 * to match against.
 */
public class KeywordMarkerTokenFilterFactory extends AbstractTokenFilterFactory {

    private final CharArraySet keywordLookup;
    private final Pattern keywordPattern;

    public KeywordMarkerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);

        boolean ignoreCase =
            settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "ignore_case", false, deprecationLogger);
        String patternString = settings.get("keywords_pattern");
        if (patternString != null) {
            // a pattern for matching keywords is specified, as opposed to a
            // set of keyword strings to match against
            if (settings.get("keywords") != null || settings.get("keywords_path") != null) {
                throw new IllegalArgumentException(
                    "cannot specify both `keywords_pattern` and `keywords` or `keywords_path`");
            }
            keywordPattern = Pattern.compile(patternString);
            keywordLookup = null;
        } else {
            Set<?> rules = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "keywords");
            if (rules == null) {
                throw new IllegalArgumentException(
                    "keyword filter requires either `keywords`, `keywords_path`, " +
                    "or `keywords_pattern` to be configured");
            }
            // a set of keywords (or a path to them) is specified
            keywordLookup = new CharArraySet(rules, ignoreCase);
            keywordPattern = null;
        }
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        if (keywordPattern != null) {
            return new PatternKeywordMarkerFilter(tokenStream, keywordPattern);
        } else {
            return new SetKeywordMarkerFilter(tokenStream, keywordLookup);
        }
    }

}
@@ -0,0 +1,103 @@

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.test.ESTestCase.TestAnalysis;
import org.elasticsearch.test.ESTokenStreamTestCase;

import java.io.IOException;

import static org.hamcrest.Matchers.instanceOf;

/**
 * Tests for the {@link KeywordMarkerTokenFilterFactory} class.
 */
public class KeywordMarkerFilterFactoryTests extends ESTokenStreamTestCase {

    /**
     * Tests using a keyword set for the keyword marker filter.
     */
    public void testKeywordSet() throws IOException {
        Settings settings = Settings.builder()
            .put("index.analysis.filter.my_keyword.type", "keyword_marker")
            .put("index.analysis.filter.my_keyword.keywords", "running, sleeping")
            .put("index.analysis.analyzer.my_keyword.type", "custom")
            .put("index.analysis.analyzer.my_keyword.tokenizer", "standard")
            .put("index.analysis.analyzer.my_keyword.filter", "my_keyword, porter_stem")
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
        TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keyword");
        assertThat(tokenFilter, instanceOf(KeywordMarkerTokenFilterFactory.class));
        TokenStream filter = tokenFilter.create(new WhitespaceTokenizer());
        assertThat(filter, instanceOf(SetKeywordMarkerFilter.class));
        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_keyword");
        // jogging is not part of the keywords set, so verify that it is the only stemmed word
        assertAnalyzesTo(analyzer, "running jogging sleeping",
            new String[] { "running", "jog", "sleeping" });
    }

    /**
     * Tests using a regular expression pattern for the keyword marker filter.
     */
    public void testKeywordPattern() throws IOException {
        Settings settings = Settings.builder()
            .put("index.analysis.filter.my_keyword.type", "keyword_marker")
            .put("index.analysis.filter.my_keyword.keywords_pattern", "run[a-z]ing")
            .put("index.analysis.analyzer.my_keyword.type", "custom")
            .put("index.analysis.analyzer.my_keyword.tokenizer", "standard")
            .put("index.analysis.analyzer.my_keyword.filter", "my_keyword, porter_stem")
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
        TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keyword");
        assertThat(tokenFilter, instanceOf(KeywordMarkerTokenFilterFactory.class));
        TokenStream filter = tokenFilter.create(new WhitespaceTokenizer());
        assertThat(filter, instanceOf(PatternKeywordMarkerFilter.class));
        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_keyword");
        // running should match the pattern, so it should not be stemmed, but sleeping should
        assertAnalyzesTo(analyzer, "running sleeping", new String[] { "running", "sleep" });
    }

    /**
     * Verifies that a keyword set and a keyword pattern cannot both be specified.
     */
    public void testCannotSpecifyBothKeywordsAndPattern() throws IOException {
        Settings settings = Settings.builder()
            .put("index.analysis.filter.my_keyword.type", "keyword_marker")
            .put("index.analysis.filter.my_keyword.keywords", "running")
            .put("index.analysis.filter.my_keyword.keywords_pattern", "run[a-z]ing")
            .put("index.analysis.analyzer.my_keyword.type", "custom")
            .put("index.analysis.analyzer.my_keyword.tokenizer", "standard")
            .put("index.analysis.analyzer.my_keyword.filter", "my_keyword, porter_stem")
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
            () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings));
        assertEquals("cannot specify both `keywords_pattern` and `keywords` or `keywords_path`",
            e.getMessage());
    }
}
@@ -12,6 +12,9 @@ any stemming filters.

|`keywords_path` |A path (either relative to `config` location, or
absolute) to a list of words.

|`keywords_pattern` |A regular expression pattern to match against words
in the text.

|`ignore_case` |Set to `true` to lower case all words first. Defaults to
`false`.
|=======================================================================
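As a hedged illustration of the settings documented above (the filter name `my_protwords`, the analyzer name `my_analyzer`, and the `analysis/protwords.txt` path are hypothetical, not from this commit), a `keyword_marker` filter can also read its protected words from a file and match them case-insensitively. The sketch mirrors the `Settings` style used in the tests above:

import org.elasticsearch.common.settings.Settings;

// Hypothetical names throughout, for illustration only.
public class KeywordMarkerSettingsSketch {
    static Settings protectedWordsAnalyzer() {
        return Settings.builder()
            // keywords_path is resolved relative to the config directory;
            // ignore_case=true makes the keyword lookup case-insensitive.
            .put("index.analysis.filter.my_protwords.type", "keyword_marker")
            .put("index.analysis.filter.my_protwords.keywords_path", "analysis/protwords.txt")
            .put("index.analysis.filter.my_protwords.ignore_case", true)
            .put("index.analysis.analyzer.my_analyzer.type", "custom")
            .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
            .put("index.analysis.analyzer.my_analyzer.filter", "my_protwords, porter_stem")
            .build();
    }
}

Adding `keywords_pattern` to the same filter definition would trip the validation introduced in this commit and fail with "cannot specify both `keywords_pattern` and `keywords` or `keywords_path`".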