Add exclusion option to `keep_types` token filter (#32012)
Currently the `keep_types` token filter includes all token types specified using its `types` parameter. Lucene's TypeTokenFilter also provides a second mode where, instead of keeping the specified tokens (include), they are filtered out (exclude). This change exposes this option as a new `mode` parameter that can take either the value `include` (the default, if not specified) or `exclude`. Closes #29277
This commit is contained in:
parent
6717df3c2d
commit
61486680a2
|
@ -8,8 +8,9 @@ contained in a predefined set.
|
||||||
[float]
|
[float]
|
||||||
=== Options
|
=== Options
|
||||||
[horizontal]
|
[horizontal]
|
||||||
types:: a list of types to keep
|
types:: a list of types to include (default mode) or exclude
|
||||||
|
mode:: if set to `include` (default) the specified token types will be kept,
|
||||||
|
if set to `exclude` the specified token types will be removed from the stream
|
||||||
|
|
||||||
[float]
|
[float]
|
||||||
=== Settings example
|
=== Settings example
|
||||||
|
@ -53,7 +54,7 @@ POST /keep_types_example/_analyze
|
||||||
// CONSOLE
|
// CONSOLE
|
||||||
// TEST[continued]
|
// TEST[continued]
|
||||||
|
|
||||||
And it'd respond:
|
The response will be:
|
||||||
|
|
||||||
[source,js]
|
[source,js]
|
||||||
--------------------------------------------------
|
--------------------------------------------------
|
||||||
|
@ -72,3 +73,70 @@ And it'd respond:
|
||||||
// TESTRESPONSE
|
// TESTRESPONSE
|
||||||
|
|
||||||
Note how only the `<NUM>` token is in the output.
|
Note how only the `<NUM>` token is in the output.
|
||||||
|
|
||||||
|
=== Exclude mode settings example
|
||||||
|
|
||||||
|
Setting the `mode` parameter to `exclude`, as in the following example, removes the specified token types from the stream instead of keeping them:
|
||||||
|
|
||||||
|
[source,js]
|
||||||
|
--------------------------------------------------
|
||||||
|
PUT /keep_types_exclude_example
|
||||||
|
{
|
||||||
|
"settings" : {
|
||||||
|
"analysis" : {
|
||||||
|
"analyzer" : {
|
||||||
|
"my_analyzer" : {
|
||||||
|
"tokenizer" : "standard",
|
||||||
|
"filter" : ["standard", "lowercase", "remove_numbers"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"filter" : {
|
||||||
|
"remove_numbers" : {
|
||||||
|
"type" : "keep_types",
|
||||||
|
"mode" : "exclude",
|
||||||
|
"types" : [ "<NUM>" ]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
--------------------------------------------------
|
||||||
|
// CONSOLE
|
||||||
|
|
||||||
|
And we test it like this:
|
||||||
|
|
||||||
|
[source,js]
|
||||||
|
--------------------------------------------------
|
||||||
|
POST /keep_types_exclude_example/_analyze
|
||||||
|
{
|
||||||
|
"analyzer" : "my_analyzer",
|
||||||
|
"text" : "hello 101 world"
|
||||||
|
}
|
||||||
|
--------------------------------------------------
|
||||||
|
// CONSOLE
|
||||||
|
// TEST[continued]
|
||||||
|
|
||||||
|
The response will be:
|
||||||
|
|
||||||
|
[source,js]
|
||||||
|
--------------------------------------------------
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{
|
||||||
|
"token": "hello",
|
||||||
|
"start_offset": 0,
|
||||||
|
"end_offset": 5,
|
||||||
|
"type": "<ALPHANUM>",
|
||||||
|
"position": 0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"token": "world",
|
||||||
|
"start_offset": 10,
|
||||||
|
"end_offset": 15,
|
||||||
|
"type": "<ALPHANUM>",
|
||||||
|
"position": 2
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
--------------------------------------------------
|
||||||
|
// TESTRESPONSE
|
|
@ -29,21 +29,47 @@ import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||||
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A {@link TokenFilterFactory} for {@link TypeTokenFilter}. This filter only
|
* A {@link TokenFilterFactory} for {@link TypeTokenFilter}. This filter only
|
||||||
* keep tokens that are contained in the set configured via
|
* keep tokens that are contained in the set configured via
|
||||||
* {@value #KEEP_TYPES_KEY} setting.
|
* {@value #KEEP_TYPES_KEY} setting (or discards them if {@value #KEEP_TYPES_MODE_KEY} is "exclude").
|
||||||
* <p>
|
* <p>
|
||||||
* Configuration options:
|
* Configuration options:
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>{@value #KEEP_TYPES_KEY} the array of words / tokens to keep.</li>
|
* <li>{@value #KEEP_TYPES_KEY} the array of token types to keep or discard.</li>
|
||||||
|
* <li>{@value #KEEP_TYPES_MODE_KEY} whether to keep ("include") or discard
|
||||||
|
* ("exclude") the specified token types.</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
|
public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
|
||||||
private final Set<String> keepTypes;
|
private final Set<String> keepTypes;
|
||||||
private static final String KEEP_TYPES_KEY = "types";
|
private final KeepTypesMode includeMode;
|
||||||
|
static final String KEEP_TYPES_KEY = "types";
|
||||||
|
static final String KEEP_TYPES_MODE_KEY = "mode";
|
||||||
|
|
||||||
|
enum KeepTypesMode {
|
||||||
|
INCLUDE, EXCLUDE;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return this.name().toLowerCase(Locale.ROOT);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static KeepTypesMode fromString(String modeString) {
|
||||||
|
String lc = modeString.toLowerCase(Locale.ROOT);
|
||||||
|
if (lc.equals("include")) {
|
||||||
|
return INCLUDE;
|
||||||
|
} else if (lc.equals("exclude")) {
|
||||||
|
return EXCLUDE;
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException("`keep_types` tokenfilter mode can only be [" + KeepTypesMode.INCLUDE + "] or ["
|
||||||
|
+ KeepTypesMode.EXCLUDE + "] but was [" + modeString + "].");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
KeepTypesFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
KeepTypesFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||||
super(indexSettings, name, settings);
|
super(indexSettings, name, settings);
|
||||||
|
@ -52,12 +78,12 @@ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
|
||||||
if ((arrayKeepTypes == null)) {
|
if ((arrayKeepTypes == null)) {
|
||||||
throw new IllegalArgumentException("keep_types requires `" + KEEP_TYPES_KEY + "` to be configured");
|
throw new IllegalArgumentException("keep_types requires `" + KEEP_TYPES_KEY + "` to be configured");
|
||||||
}
|
}
|
||||||
|
this.includeMode = KeepTypesMode.fromString(settings.get(KEEP_TYPES_MODE_KEY, "include"));
|
||||||
this.keepTypes = new HashSet<>(arrayKeepTypes);
|
this.keepTypes = new HashSet<>(arrayKeepTypes);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
return new TypeTokenFilter(tokenStream, keepTypes, true);
|
return new TypeTokenFilter(tokenStream, keepTypes, includeMode == KeepTypesMode.INCLUDE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,19 +34,51 @@ import java.io.StringReader;
|
||||||
import static org.hamcrest.Matchers.instanceOf;
|
import static org.hamcrest.Matchers.instanceOf;
|
||||||
|
|
||||||
public class KeepTypesFilterFactoryTests extends ESTokenStreamTestCase {
|
public class KeepTypesFilterFactoryTests extends ESTokenStreamTestCase {
|
||||||
public void testKeepTypes() throws IOException {
|
|
||||||
Settings settings = Settings.builder()
|
private static final String BASE_SETTING = "index.analysis.filter.keep_numbers";
|
||||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
|
||||||
.put("index.analysis.filter.keep_numbers.type", "keep_types")
|
public void testKeepTypesInclude() throws IOException {
|
||||||
.putList("index.analysis.filter.keep_numbers.types", new String[] {"<NUM>", "<SOMETHINGELSE>"})
|
Settings.Builder settingsBuilder = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
.build();
|
.put(BASE_SETTING + ".type", "keep_types")
|
||||||
|
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" });
|
||||||
|
// either use default mode or set "include" mode explicitly
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
settingsBuilder.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY,
|
||||||
|
KeepTypesFilterFactory.KeepTypesMode.INCLUDE);
|
||||||
|
}
|
||||||
|
Settings settings = settingsBuilder.build();
|
||||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
||||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
|
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
|
||||||
assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
|
assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
|
||||||
String source = "Hello 123 world";
|
String source = "Hello 123 world";
|
||||||
String[] expected = new String[]{"123"};
|
String[] expected = new String[] { "123" };
|
||||||
Tokenizer tokenizer = new StandardTokenizer();
|
Tokenizer tokenizer = new StandardTokenizer();
|
||||||
tokenizer.setReader(new StringReader(source));
|
tokenizer.setReader(new StringReader(source));
|
||||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{2});
|
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 2 });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testKeepTypesExclude() throws IOException {
|
||||||
|
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
|
.put(BASE_SETTING + ".type", "keep_types")
|
||||||
|
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" })
|
||||||
|
.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, KeepTypesFilterFactory.KeepTypesMode.EXCLUDE).build();
|
||||||
|
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
||||||
|
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
|
||||||
|
assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
|
||||||
|
String source = "Hello 123 world";
|
||||||
|
String[] expected = new String[] { "Hello", "world" };
|
||||||
|
Tokenizer tokenizer = new StandardTokenizer();
|
||||||
|
tokenizer.setReader(new StringReader(source));
|
||||||
|
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 1, 2 });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testKeepTypesException() throws IOException {
|
||||||
|
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
|
.put(BASE_SETTING + ".type", "keep_types")
|
||||||
|
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" })
|
||||||
|
.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, "bad_parameter").build();
|
||||||
|
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
|
||||||
|
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()));
|
||||||
|
assertEquals("`keep_types` tokenfilter mode can only be [include] or [exclude] but was [bad_parameter].", ex.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue