Add exclusion option to `keep_types` token filter (#32012)
Currently the `keep_types` token filter includes all token types specified using its `types` parameter. Lucene's `TypeTokenFilter` also provides a second mode where, instead of keeping the specified tokens (include), they are filtered out (exclude). This change exposes this option as a new `mode` parameter that can take either the value `include` (the default, if not specified) or `exclude`. Closes #29277
This commit is contained in:
parent
6717df3c2d
commit
61486680a2
|
@ -8,8 +8,9 @@ contained in a predefined set.
|
|||
[float]
|
||||
=== Options
|
||||
[horizontal]
|
||||
types:: a list of types to keep
|
||||
|
||||
types:: a list of types to include (default mode) or exclude
|
||||
mode:: if set to `include` (default) the specified token types will be kept,
|
||||
if set to `exclude` the specified token types will be removed from the stream
|
||||
|
||||
[float]
|
||||
=== Settings example
|
||||
|
@ -53,7 +54,7 @@ POST /keep_types_example/_analyze
|
|||
// CONSOLE
|
||||
// TEST[continued]
|
||||
|
||||
And it'd respond:
|
||||
The response will be:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
|
@ -72,3 +73,70 @@ And it'd respond:
|
|||
// TESTRESPONSE
|
||||
|
||||
Note how only the `<NUM>` token is in the output.
|
||||
|
||||
=== Exclude mode settings example
|
||||
|
||||
If the `mode` parameter is set to `exclude` like in the following example:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
PUT /keep_types_exclude_example
|
||||
{
|
||||
"settings" : {
|
||||
"analysis" : {
|
||||
"analyzer" : {
|
||||
"my_analyzer" : {
|
||||
"tokenizer" : "standard",
|
||||
"filter" : ["standard", "lowercase", "remove_numbers"]
|
||||
}
|
||||
},
|
||||
"filter" : {
|
||||
"remove_numbers" : {
|
||||
"type" : "keep_types",
|
||||
"mode" : "exclude",
|
||||
"types" : [ "<NUM>" ]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
// CONSOLE
|
||||
|
||||
We can then test it with:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
POST /keep_types_exclude_example/_analyze
|
||||
{
|
||||
"analyzer" : "my_analyzer",
|
||||
"text" : "hello 101 world"
|
||||
}
|
||||
--------------------------------------------------
|
||||
// CONSOLE
|
||||
// TEST[continued]
|
||||
|
||||
The response will be:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "hello",
|
||||
"start_offset": 0,
|
||||
"end_offset": 5,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "world",
|
||||
"start_offset": 10,
|
||||
"end_offset": 15,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 2
|
||||
}
|
||||
]
|
||||
}
|
||||
--------------------------------------------------
|
||||
// TESTRESPONSE
|
|
@ -29,21 +29,47 @@ import org.elasticsearch.index.analysis.TokenFilterFactory;
|
|||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilterFactory} for {@link TypeTokenFilter}. This filter only
|
||||
* keeps tokens that are contained in the set configured via
|
||||
* {@value #KEEP_TYPES_KEY} setting.
|
||||
* {@value #KEEP_TYPES_MODE_KEY} setting.
|
||||
* <p>
|
||||
* Configuration options:
|
||||
* <ul>
|
||||
* <li>{@value #KEEP_TYPES_KEY} the array of words / tokens to keep.</li>
|
||||
* <li>{@value #KEEP_TYPES_KEY} the array of token types.</li>
|
||||
* <li>{@value #KEEP_TYPES_MODE_KEY} whether to keep ("include") or discard
|
||||
* ("exclude") the specified token types.</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
|
||||
private final Set<String> keepTypes;
|
||||
private static final String KEEP_TYPES_KEY = "types";
|
||||
private final KeepTypesMode includeMode;
|
||||
static final String KEEP_TYPES_KEY = "types";
|
||||
static final String KEEP_TYPES_MODE_KEY = "mode";
|
||||
|
||||
/**
 * The two values accepted for the {@code mode} setting of the filter.
 * The factory compares the parsed value against {@link #INCLUDE} to decide
 * which flag to hand to the underlying {@code TypeTokenFilter}.
 */
enum KeepTypesMode {
|
||||
INCLUDE, EXCLUDE;
|
||||
|
||||
// Lower-cased constant name ("include" / "exclude"); Locale.ROOT keeps the
// result independent of the JVM default locale.
@Override
|
||||
public String toString() {
|
||||
return this.name().toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
// Parses a mode string case-insensitively. Anything other than "include" or
// "exclude" is rejected with an IllegalArgumentException that echoes the
// original (non-lower-cased) input. A null argument is not handled here.
private static KeepTypesMode fromString(String modeString) {
|
||||
String lc = modeString.toLowerCase(Locale.ROOT);
|
||||
if (lc.equals("include")) {
|
||||
return INCLUDE;
|
||||
} else if (lc.equals("exclude")) {
|
||||
return EXCLUDE;
|
||||
} else {
|
||||
throw new IllegalArgumentException("`keep_types` tokenfilter mode can only be [" + KeepTypesMode.INCLUDE + "] or ["
|
||||
+ KeepTypesMode.EXCLUDE + "] but was [" + modeString + "].");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
KeepTypesFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
@ -52,12 +78,12 @@ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
|
|||
if ((arrayKeepTypes == null)) {
|
||||
throw new IllegalArgumentException("keep_types requires `" + KEEP_TYPES_KEY + "` to be configured");
|
||||
}
|
||||
|
||||
this.includeMode = KeepTypesMode.fromString(settings.get(KEEP_TYPES_MODE_KEY, "include"));
|
||||
this.keepTypes = new HashSet<>(arrayKeepTypes);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new TypeTokenFilter(tokenStream, keepTypes, true);
|
||||
return new TypeTokenFilter(tokenStream, keepTypes, includeMode == KeepTypesMode.INCLUDE);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,19 +34,51 @@ import java.io.StringReader;
|
|||
import static org.hamcrest.Matchers.instanceOf;
|
||||
|
||||
public class KeepTypesFilterFactoryTests extends ESTokenStreamTestCase {
|
||||
public void testKeepTypes() throws IOException {
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.keep_numbers.type", "keep_types")
|
||||
.putList("index.analysis.filter.keep_numbers.types", new String[] {"<NUM>", "<SOMETHINGELSE>"})
|
||||
.build();
|
||||
|
||||
private static final String BASE_SETTING = "index.analysis.filter.keep_numbers";
|
||||
|
||||
public void testKeepTypesInclude() throws IOException {
|
||||
Settings.Builder settingsBuilder = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put(BASE_SETTING + ".type", "keep_types")
|
||||
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" });
|
||||
// either use default mode or set "include" mode explicitly
|
||||
if (random().nextBoolean()) {
|
||||
settingsBuilder.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY,
|
||||
KeepTypesFilterFactory.KeepTypesMode.INCLUDE);
|
||||
}
|
||||
Settings settings = settingsBuilder.build();
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
|
||||
assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
|
||||
String source = "Hello 123 world";
|
||||
String[] expected = new String[]{"123"};
|
||||
String[] expected = new String[] { "123" };
|
||||
Tokenizer tokenizer = new StandardTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{2});
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 2 });
|
||||
}
|
||||
|
||||
/**
 * Builds a {@code keep_types} filter with the mode explicitly set to
 * {@code EXCLUDE} and the types {@code <NUM>} and {@code <SOMETHINGELSE>},
 * then checks that tokenizing "Hello 123 world" yields only "Hello" and
 * "world".
 */
public void testKeepTypesExclude() throws IOException {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put(BASE_SETTING + ".type", "keep_types")
|
||||
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" })
|
||||
.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, KeepTypesFilterFactory.KeepTypesMode.EXCLUDE).build();
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
|
||||
assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
|
||||
String source = "Hello 123 world";
|
||||
String[] expected = new String[] { "Hello", "world" };
|
||||
Tokenizer tokenizer = new StandardTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
// NOTE(review): the int[] is presumably the expected position increments
// (2 for "world" because the removed "123" leaves a gap) - confirm against
// the assertTokenStreamContents overload in use.
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 1, 2 });
|
||||
}
|
||||
|
||||
/**
 * Configures the filter with the unsupported mode value "bad_parameter" and
 * checks that creating the analysis fails with an
 * {@link IllegalArgumentException} carrying the exact expected message.
 */
public void testKeepTypesException() throws IOException {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put(BASE_SETTING + ".type", "keep_types")
|
||||
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" })
|
||||
.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, "bad_parameter").build();
|
||||
// The failure is expected while the analysis is being built, not when a
// token stream is later created.
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
|
||||
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()));
|
||||
assertEquals("`keep_types` tokenfilter mode can only be [include] or [exclude] but was [bad_parameter].", ex.getMessage());
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue