Add exclusion option to `keep_types` token filter (#32012)

Currently the `keep_types` token filter keeps only the token types specified
via its `types` parameter. Lucene's TypeTokenFilter also provides a second mode
where, instead of keeping tokens of the specified types (include), they are
filtered out (exclude). This change exposes this option as a new `mode`
parameter that takes either the value `include` (the default, if not specified)
or `exclude`.

Closes #29277
Christoph Büscher 2018-07-17 09:04:41 +02:00 committed by GitHub
parent 6717df3c2d
commit 61486680a2
3 changed files with 142 additions and 16 deletions
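
The new `mode` parameter maps directly onto the boolean `useWhiteList` argument of Lucene's `org.apache.lucene.analysis.core.TypeTokenFilter`. The following standalone sketch (not part of this commit) illustrates the two modes against the sample text used in the documentation below; it assumes Lucene's core and analyzers-common jars on the classpath and the three-argument `TypeTokenFilter` constructor.

[source,java]
--------------------------------------------------
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.TypeTokenFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TypeTokenFilterModes {
    public static void main(String[] args) throws Exception {
        Set<String> types = new HashSet<>(Arrays.asList("<NUM>"));
        // useWhiteList = true keeps only the listed types ("include"),
        // useWhiteList = false removes the listed types ("exclude").
        for (boolean useWhiteList : new boolean[] { true, false }) {
            StandardTokenizer tokenizer = new StandardTokenizer();
            tokenizer.setReader(new StringReader("hello 101 world"));
            TokenStream stream = new TypeTokenFilter(tokenizer, types, useWhiteList);
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringBuilder tokens = new StringBuilder();
            while (stream.incrementToken()) {
                tokens.append('[').append(term).append(']');
            }
            stream.end();
            stream.close();
            // prints "include: [101]" on the first pass, "exclude: [hello][world]" on the second
            System.out.println((useWhiteList ? "include: " : "exclude: ") + tokens);
        }
    }
}
--------------------------------------------------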

Changed file: `keep_types` token filter documentation (asciidoc)
@@ -8,8 +8,9 @@ contained in a predefined set.
 
 [float]
 === Options
 [horizontal]
-types:: a list of types to keep
+types:: a list of types to include (default mode) or exclude
+mode:: if set to `include` (default) the specified token types will be kept, if set to `exclude` the specified token types will be removed from the stream
 
 [float]
 === Settings example
@@ -53,7 +54,7 @@ POST /keep_types_example/_analyze
 // CONSOLE
 // TEST[continued]
 
-And it'd respond:
+The response will be:
 
 [source,js]
 --------------------------------------------------
@@ -72,3 +73,70 @@ And it'd respond:
 // TESTRESPONSE
 
 Note how only the `<NUM>` token is in the output.
+
+=== Exclude mode settings example
+
+If the `mode` parameter is set to `exclude` like in the following example:
+
+[source,js]
+--------------------------------------------------
+PUT /keep_types_exclude_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_analyzer" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "lowercase", "remove_numbers"]
+                }
+            },
+            "filter" : {
+                "remove_numbers" : {
+                    "type" : "keep_types",
+                    "mode" : "exclude",
+                    "types" : [ "<NUM>" ]
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+// CONSOLE
+
+And we test it like:
+
+[source,js]
+--------------------------------------------------
+POST /keep_types_exclude_example/_analyze
+{
+  "analyzer" : "my_analyzer",
+  "text" : "hello 101 world"
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+The response will be:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "hello",
+      "start_offset": 0,
+      "end_offset": 5,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "world",
+      "start_offset": 10,
+      "end_offset": 15,
+      "type": "<ALPHANUM>",
+      "position": 2
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE
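
The same `_analyze` call can also be issued from Java. This is a minimal sketch, not part of the commit, assuming the low-level REST client (`org.elasticsearch.client.RestClient` with the `Request` API introduced around 6.4), a node reachable at `localhost:9200`, and that the `keep_types_exclude_example` index from the snippet above has already been created.

[source,java]
--------------------------------------------------
import org.apache.http.HttpHost;
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.Request;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestClient;

public class AnalyzeExcludeExample {
    public static void main(String[] args) throws Exception {
        // Connect to a local node and run the analyzer from the docs example above.
        try (RestClient client = RestClient.builder(new HttpHost("localhost", 9200, "http")).build()) {
            Request request = new Request("POST", "/keep_types_exclude_example/_analyze");
            request.setJsonEntity("{ \"analyzer\": \"my_analyzer\", \"text\": \"hello 101 world\" }");
            Response response = client.performRequest(request);
            // With mode=exclude, the response should contain only the
            // <ALPHANUM> tokens "hello" and "world".
            System.out.println(EntityUtils.toString(response.getEntity()));
        }
    }
}
--------------------------------------------------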

Changed file: KeepTypesFilterFactory.java
@@ -29,21 +29,47 @@ import org.elasticsearch.index.analysis.TokenFilterFactory;
 
 import java.util.HashSet;
 import java.util.List;
+import java.util.Locale;
 import java.util.Set;
 
 /**
  * A {@link TokenFilterFactory} for {@link TypeTokenFilter}. This filter only
  * keep tokens that are contained in the set configured via
- * {@value #KEEP_TYPES_KEY} setting.
+ * {@value #KEEP_TYPES_MODE_KEY} setting.
  * <p>
  * Configuration options:
  * <ul>
- * <li>{@value #KEEP_TYPES_KEY} the array of words / tokens to keep.</li>
+ * <li>{@value #KEEP_TYPES_KEY} the array of words / tokens.</li>
+ * <li>{@value #KEEP_TYPES_MODE_KEY} whether to keep ("include") or discard
+ * ("exclude") the specified token types.</li>
  * </ul>
  */
 public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
     private final Set<String> keepTypes;
-    private static final String KEEP_TYPES_KEY = "types";
+    private final KeepTypesMode includeMode;
+
+    static final String KEEP_TYPES_KEY = "types";
+    static final String KEEP_TYPES_MODE_KEY = "mode";
+
+    enum KeepTypesMode {
+        INCLUDE, EXCLUDE;
+
+        @Override
+        public String toString() {
+            return this.name().toLowerCase(Locale.ROOT);
+        }
+
+        private static KeepTypesMode fromString(String modeString) {
+            String lc = modeString.toLowerCase(Locale.ROOT);
+            if (lc.equals("include")) {
+                return INCLUDE;
+            } else if (lc.equals("exclude")) {
+                return EXCLUDE;
+            } else {
+                throw new IllegalArgumentException("`keep_types` tokenfilter mode can only be [" + KeepTypesMode.INCLUDE + "] or ["
+                        + KeepTypesMode.EXCLUDE + "] but was [" + modeString + "].");
+            }
+        }
+    };
 
     KeepTypesFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);
@@ -52,12 +78,12 @@ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
         if ((arrayKeepTypes == null)) {
             throw new IllegalArgumentException("keep_types requires `" + KEEP_TYPES_KEY + "` to be configured");
         }
+        this.includeMode = KeepTypesMode.fromString(settings.get(KEEP_TYPES_MODE_KEY, "include"));
         this.keepTypes = new HashSet<>(arrayKeepTypes);
     }
 
     @Override
     public TokenStream create(TokenStream tokenStream) {
-        return new TypeTokenFilter(tokenStream, keepTypes, true);
+        return new TypeTokenFilter(tokenStream, keepTypes, includeMode == KeepTypesMode.INCLUDE);
    }
 }
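
To summarize the wiring introduced above: the `mode` setting defaults to `include`, is parsed case-insensitively, rejects anything other than `include` or `exclude`, and ultimately decides the `useWhiteList` flag handed to Lucene's `TypeTokenFilter`. The sketch below is a hypothetical standalone paraphrase of that contract for illustration; the class and method names are made up and are not the actual Elasticsearch code.

[source,java]
--------------------------------------------------
import java.util.Locale;
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.TypeTokenFilter;

// Hypothetical standalone paraphrase of the factory logic above, for illustration only.
final class KeepTypesSketch {

    enum Mode { INCLUDE, EXCLUDE }

    // Mirrors KeepTypesMode.fromString: defaults to "include", case-insensitive, rejects anything else.
    static Mode parseMode(String modeString) {
        String lc = modeString == null ? "include" : modeString.toLowerCase(Locale.ROOT);
        switch (lc) {
            case "include": return Mode.INCLUDE;
            case "exclude": return Mode.EXCLUDE;
            default:
                throw new IllegalArgumentException(
                    "`keep_types` tokenfilter mode can only be [include] or [exclude] but was [" + modeString + "].");
        }
    }

    // Mirrors KeepTypesFilterFactory.create: INCLUDE -> keep listed types, EXCLUDE -> drop them.
    static TokenStream wrap(TokenStream in, Set<String> types, Mode mode) {
        return new TypeTokenFilter(in, types, mode == Mode.INCLUDE);
    }
}
--------------------------------------------------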

Changed file: KeepTypesFilterFactoryTests.java
@@ -34,19 +34,51 @@ import java.io.StringReader;
 
 import static org.hamcrest.Matchers.instanceOf;
 
 public class KeepTypesFilterFactoryTests extends ESTokenStreamTestCase {
-    public void testKeepTypes() throws IOException {
-        Settings settings = Settings.builder()
-                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-                .put("index.analysis.filter.keep_numbers.type", "keep_types")
-                .putList("index.analysis.filter.keep_numbers.types", new String[] {"<NUM>", "<SOMETHINGELSE>"})
-                .build();
+
+    private static final String BASE_SETTING = "index.analysis.filter.keep_numbers";
+
+    public void testKeepTypesInclude() throws IOException {
+        Settings.Builder settingsBuilder = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put(BASE_SETTING + ".type", "keep_types")
+                .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" });
+        // either use default mode or set "include" mode explicitly
+        if (random().nextBoolean()) {
+            settingsBuilder.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY,
+                    KeepTypesFilterFactory.KeepTypesMode.INCLUDE);
+        }
+        Settings settings = settingsBuilder.build();
         ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
         assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
         String source = "Hello 123 world";
-        String[] expected = new String[]{"123"};
+        String[] expected = new String[] { "123" };
         Tokenizer tokenizer = new StandardTokenizer();
         tokenizer.setReader(new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{2});
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 2 });
+    }
+
+    public void testKeepTypesExclude() throws IOException {
+        Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put(BASE_SETTING + ".type", "keep_types")
+                .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" })
+                .put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, KeepTypesFilterFactory.KeepTypesMode.EXCLUDE).build();
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
+        assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
+        String source = "Hello 123 world";
+        String[] expected = new String[] { "Hello", "world" };
+        Tokenizer tokenizer = new StandardTokenizer();
+        tokenizer.setReader(new StringReader(source));
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 1, 2 });
+    }
+
+    public void testKeepTypesException() throws IOException {
+        Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put(BASE_SETTING + ".type", "keep_types")
+                .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" })
+                .put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, "bad_parameter").build();
+        IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
+                () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()));
+        assertEquals("`keep_types` tokenfilter mode can only be [include] or [exclude] but was [bad_parameter].", ex.getMessage());
+    }
 }