From 5c7cefa29272e0cf474d0f155d7e6e156179a117 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 15 Aug 2014 09:28:12 -0400 Subject: [PATCH] Analysis: Add keep_types for filtering by token type --- docs/reference/analysis/tokenfilters.asciidoc | 2 + .../keep-types-tokenfilter.asciidoc | 39 +++++++++++ .../index/analysis/AnalysisModule.java | 1 + .../analysis/KeepTypesFilterFactory.java | 69 +++++++++++++++++++ .../index/analysis/AnalysisFactoryTests.java | 3 +- .../analysis/KeepTypesFilterFactoryTests.java | 50 ++++++++++++++ 6 files changed, 162 insertions(+), 2 deletions(-) create mode 100644 docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc create mode 100644 src/main/java/org/elasticsearch/index/analysis/KeepTypesFilterFactory.java create mode 100644 src/test/java/org/elasticsearch/index/analysis/KeepTypesFilterFactoryTests.java diff --git a/docs/reference/analysis/tokenfilters.asciidoc b/docs/reference/analysis/tokenfilters.asciidoc index ec46c26de8e..ba2ea71c551 100644 --- a/docs/reference/analysis/tokenfilters.asciidoc +++ b/docs/reference/analysis/tokenfilters.asciidoc @@ -79,6 +79,8 @@ include::tokenfilters/delimited-payload-tokenfilter.asciidoc[] include::tokenfilters/keep-words-tokenfilter.asciidoc[] +include::tokenfilters/keep-types-tokenfilter.asciidoc[] + include::tokenfilters/classic-tokenfilter.asciidoc[] include::tokenfilters/apostrophe-tokenfilter.asciidoc[] diff --git a/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc new file mode 100644 index 00000000000..8e504e47306 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc @@ -0,0 +1,39 @@ +[[analysis-keep-types-tokenfilter]] +=== Keep Types Token Filter + +coming[1.4.0] + +A token filter of type `keep_types` that only keeps tokens with a token type +contained in a predefined set. 
+ + +[float] +=== Options +[horizontal] +types:: a list of types to keep + + +[float] +=== Settings example + +[source,js] +-------------------------------------------------- +{ + "index" : { + "analysis" : { + "analyzer" : { + "my_analyzer" : { + "tokenizer" : "standard", + "filter" : ["standard", "lowercase", "extract_numbers"] + } + }, + "filter" : { + "extract_numbers" : { + "type" : "keep_types", + "types" : [ "<NUM>" ] + } + } + } + } +} +-------------------------------------------------- diff --git a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java index 2b65c17ac7d..a4904330142 100644 --- a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java +++ b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java @@ -485,6 +485,7 @@ public class AnalysisModule extends AbstractModule { tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class); + tokenFiltersBindings.processTokenFilter("keep_types", KeepTypesFilterFactory.class); tokenFiltersBindings.processTokenFilter("pattern_capture", PatternCaptureGroupTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("pattern_replace", PatternReplaceTokenFilterFactory.class); diff --git a/src/main/java/org/elasticsearch/index/analysis/KeepTypesFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/KeepTypesFilterFactory.java new file mode 100644 index 00000000000..5c69a2b03f4 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/KeepTypesFilterFactory.java @@ -0,0 +1,69 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership.
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.TypeTokenFilter; +import org.elasticsearch.ElasticsearchIllegalArgumentException; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +/** + * A {@link TokenFilterFactory} for {@link TypeTokenFilter}. This filter only + * keeps tokens whose type is contained in the set configured via the + * {@value #KEEP_TYPES_KEY} setting. + *

+ * Configuration options: + * + * <ul> + * <li>{@value #KEEP_TYPES_KEY}: the array of token types to keep (required).</li> + * </ul>

+ */ +@AnalysisSettingsRequired +public class KeepTypesFilterFactory extends AbstractTokenFilterFactory { + private final Set<String> keepTypes; // token types whose tokens pass through the filter + private static final String KEEP_TYPES_KEY = "types"; // name of the setting that lists the types to keep + + @Inject + public KeepTypesFilterFactory(Index index, @IndexSettings Settings indexSettings, + Environment env, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + + final String[] arrayKeepTypes = settings.getAsArray(KEEP_TYPES_KEY, null); // null default lets a missing setting be detected + if (arrayKeepTypes == null) { + throw new ElasticsearchIllegalArgumentException("keep_types requires `" + KEEP_TYPES_KEY + "` to be configured"); + } + + this.keepTypes = new HashSet<>(Arrays.asList(arrayKeepTypes)); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new TypeTokenFilter(version, tokenStream, keepTypes, true); // true = whitelist mode: keep only the configured types + } +} diff --git a/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java b/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java index aff1fd51452..895ac2c9211 100644 --- a/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java @@ -147,6 +147,7 @@ public class AnalysisFactoryTests extends ElasticsearchTestCase { put("trim", TrimTokenFilterFactory.class); put("truncate", TruncateTokenFilterFactory.class); put("turkishlowercase", LowerCaseTokenFilterFactory.class); + put("type", KeepTypesFilterFactory.class); put("uppercase", UpperCaseTokenFilterFactory.class); put("worddelimiter", WordDelimiterTokenFilterFactory.class); @@ -168,8 +169,6 @@ public class AnalysisFactoryTests extends ElasticsearchTestCase { put("removeduplicates", Void.class); // ???
put("tokenoffsetpayload", Void.class); - // like a stop filter but by token-type - put("type", Void.class); // puts the type into the payload put("typeaspayload", Void.class); }}; diff --git a/src/test/java/org/elasticsearch/index/analysis/KeepTypesFilterFactoryTests.java b/src/test/java/org/elasticsearch/index/analysis/KeepTypesFilterFactoryTests.java new file mode 100644 index 00000000000..425784d64da --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/KeepTypesFilterFactoryTests.java @@ -0,0 +1,50 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.test.ElasticsearchTokenStreamTestCase; +import org.junit.Test; + +import java.io.IOException; +import java.io.StringReader; + +import static org.hamcrest.Matchers.instanceOf; + +public class KeepTypesFilterFactoryTests extends ElasticsearchTokenStreamTestCase { + + @Test + public void testKeepTypes() throws IOException { + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.keep_numbers.type", "keep_types") + .putArray("index.analysis.filter.keep_numbers.types", new String[] {"<NUM>", "<SOMETHINGELSE>"}) + .build(); + AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings); + TokenFilterFactory tokenFilter = analysisService.tokenFilter("keep_numbers"); + assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class)); + String source = "Hello 123 world"; + String[] expected = new String[]{"123"}; // only the numeric token has a configured type + Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{2}); // position increment 2: "Hello" was dropped before "123" + } +}