Keep word filter through an error if `keep_word_path` was specified.
Closes #4073
This commit is contained in:
parent
555ececaa9
commit
47969efae9
|
@ -17,14 +17,11 @@ package org.elasticsearch.index.analysis;
|
||||||
* specific language governing permissions and limitations
|
* specific language governing permissions and limitations
|
||||||
* under the License.
|
* under the License.
|
||||||
*/
|
*/
|
||||||
import org.apache.lucene.util.Version;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
|
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
import org.elasticsearch.ElasticSearchIllegalArgumentException;
|
import org.elasticsearch.ElasticSearchIllegalArgumentException;
|
||||||
import org.elasticsearch.common.inject.Inject;
|
import org.elasticsearch.common.inject.Inject;
|
||||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||||
|
@ -32,33 +29,31 @@ import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.env.Environment;
|
import org.elasticsearch.env.Environment;
|
||||||
import org.elasticsearch.index.Index;
|
import org.elasticsearch.index.Index;
|
||||||
import org.elasticsearch.index.settings.IndexSettings;
|
import org.elasticsearch.index.settings.IndexSettings;
|
||||||
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A {@link TokenFilterFactory} for {@link KeepWordFilter}. This filter only
|
* A {@link TokenFilterFactory} for {@link KeepWordFilter}. This filter only
|
||||||
* keep tokens that are contained in the term set configured via
|
* keep tokens that are contained in the term set configured via
|
||||||
* {@value #KEEP_WORDS_KEY} setting. This filter acts like an inverse stop
|
* {@value #KEEP_WORDS_KEY} setting. This filter acts like an inverse stop
|
||||||
* filter.
|
* filter.
|
||||||
*
|
* <p/>
|
||||||
* Configuration options:
|
* Configuration options:
|
||||||
*
|
* <p/>
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>{@value #KEEP_WORDS_KEY} the array of words / tokens to keep.</li>
|
* <li>{@value #KEEP_WORDS_KEY} the array of words / tokens to keep.</li>
|
||||||
*
|
* <p/>
|
||||||
* <li>{@value #KEEP_WORDS_PATH_KEY} an reference to a file containing the words
|
* <li>{@value #KEEP_WORDS_PATH_KEY} an reference to a file containing the words
|
||||||
* / tokens to keep. Note: this is an alternative to {@value #KEEP_WORDS_KEY} if
|
* / tokens to keep. Note: this is an alternative to {@value #KEEP_WORDS_KEY} if
|
||||||
* both are set an exception will be thrown.</li>
|
* both are set an exception will be thrown.</li>
|
||||||
*
|
* <p/>
|
||||||
* <li>{@value #ENABLE_POS_INC_KEY} <code>true</code> iff the filter should
|
* <li>{@value #ENABLE_POS_INC_KEY} <code>true</code> iff the filter should
|
||||||
* maintain position increments for dropped tokens. The default is
|
* maintain position increments for dropped tokens. The default is
|
||||||
* <code>true</code>.</li>
|
* <code>true</code>.</li>
|
||||||
*
|
* <p/>
|
||||||
* <li>{@value #KEEP_WORDS_CASE_KEY} to use case sensitive keep words. The
|
* <li>{@value #KEEP_WORDS_CASE_KEY} to use case sensitive keep words. The
|
||||||
* default is <code>false</code> which corresponds to case-sensitive.</li>
|
* default is <code>false</code> which corresponds to case-sensitive.</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*
|
*
|
||||||
* @see StopTokenFilterFactory
|
* @see StopTokenFilterFactory
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
@AnalysisSettingsRequired
|
@AnalysisSettingsRequired
|
||||||
public class KeepWordFilterFactory extends AbstractTokenFilterFactory {
|
public class KeepWordFilterFactory extends AbstractTokenFilterFactory {
|
||||||
|
@ -71,19 +66,18 @@ public class KeepWordFilterFactory extends AbstractTokenFilterFactory {
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public KeepWordFilterFactory(Index index, @IndexSettings Settings indexSettings,
|
public KeepWordFilterFactory(Index index, @IndexSettings Settings indexSettings,
|
||||||
Environment env, IndicesAnalysisService indicesAnalysisService, Map<String, TokenizerFactoryFactory> tokenizerFactories,
|
Environment env, @Assisted String name, @Assisted Settings settings) {
|
||||||
@Assisted String name, @Assisted Settings settings) {
|
|
||||||
super(index, indexSettings, name, settings);
|
super(index, indexSettings, name, settings);
|
||||||
|
|
||||||
final String[] arrayKeepWords = settings.getAsArray(KEEP_WORDS_KEY);
|
final String[] arrayKeepWords = settings.getAsArray(KEEP_WORDS_KEY, null);
|
||||||
final String keepWordsPath = settings.get(KEEP_WORDS_PATH_KEY, null);
|
final String keepWordsPath = settings.get(KEEP_WORDS_PATH_KEY, null);
|
||||||
if (!(arrayKeepWords == null ^ keepWordsPath == null)) {
|
if ((arrayKeepWords == null && keepWordsPath == null) || (arrayKeepWords != null && keepWordsPath != null)) {
|
||||||
// we don't allow both or non
|
// we don't allow both or none
|
||||||
throw new ElasticSearchIllegalArgumentException("keep requires either `" + KEEP_WORDS_KEY + "` or `"
|
throw new ElasticSearchIllegalArgumentException("keep requires either `" + KEEP_WORDS_KEY + "` or `"
|
||||||
+ KEEP_WORDS_PATH_KEY + "` to be configured");
|
+ KEEP_WORDS_PATH_KEY + "` to be configured");
|
||||||
}
|
}
|
||||||
if (version.onOrAfter(Version.LUCENE_44) && settings.get(ENABLE_POS_INC_KEY) != null) {
|
if (version.onOrAfter(Version.LUCENE_44) && settings.get(ENABLE_POS_INC_KEY) != null) {
|
||||||
throw new ElasticSearchIllegalArgumentException(ENABLE_POS_INC_KEY + " is not supported anymore. Please fix your analysis chain or use"
|
throw new ElasticSearchIllegalArgumentException(ENABLE_POS_INC_KEY + " is not supported anymore. Please fix your analysis chain or use"
|
||||||
+ " an older compatibility version (<=4.3) but beware that it might cause highlighting bugs.");
|
+ " an older compatibility version (<=4.3) but beware that it might cause highlighting bugs.");
|
||||||
}
|
}
|
||||||
enablePositionIncrements = version.onOrAfter(Version.LUCENE_44) ? true : settings.getAsBoolean(ENABLE_POS_INC_KEY, true);
|
enablePositionIncrements = version.onOrAfter(Version.LUCENE_44) ? true : settings.getAsBoolean(ENABLE_POS_INC_KEY, true);
|
||||||
|
@ -101,7 +95,4 @@ public class KeepWordFilterFactory extends AbstractTokenFilterFactory {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
import org.elasticsearch.ElasticSearchIllegalArgumentException;
|
import org.elasticsearch.ElasticSearchIllegalArgumentException;
|
||||||
import org.elasticsearch.common.settings.ImmutableSettings;
|
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.env.FailedToResolveConfigException;
|
||||||
import org.elasticsearch.test.ElasticsearchTokenStreamTestCase;
|
import org.elasticsearch.test.ElasticsearchTokenStreamTestCase;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
@ -48,10 +49,10 @@ public class KeepFilterFactoryTests extends ElasticsearchTokenStreamTestCase {
|
||||||
@Test
|
@Test
|
||||||
public void testLoadOverConfiguredSettings() {
|
public void testLoadOverConfiguredSettings() {
|
||||||
Settings settings = ImmutableSettings.settingsBuilder()
|
Settings settings = ImmutableSettings.settingsBuilder()
|
||||||
.put("index.analysis.filter.broken_keep_filter.type", "keep")
|
.put("index.analysis.filter.broken_keep_filter.type", "keep")
|
||||||
.put("index.analysis.filter.broken_keep_filter.keep_words_path", "does/not/exists.txt")
|
.put("index.analysis.filter.broken_keep_filter.keep_words_path", "does/not/exists.txt")
|
||||||
.put("index.analysis.filter.broken_keep_filter.keep_words", "[\"Hello\", \"worlD\"]")
|
.put("index.analysis.filter.broken_keep_filter.keep_words", "[\"Hello\", \"worlD\"]")
|
||||||
.build();
|
.build();
|
||||||
try {
|
try {
|
||||||
AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
|
AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
|
||||||
Assert.fail("path and array are configured");
|
Assert.fail("path and array are configured");
|
||||||
|
@ -60,6 +61,33 @@ public class KeepFilterFactoryTests extends ElasticsearchTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testKeepWordsPathSettings() {
|
||||||
|
Settings settings = ImmutableSettings.settingsBuilder()
|
||||||
|
.put("index.analysis.filter.non_broken_keep_filter.type", "keep")
|
||||||
|
.put("index.analysis.filter.non_broken_keep_filter.keep_words_path", "does/not/exists.txt")
|
||||||
|
.build();
|
||||||
|
try {
|
||||||
|
// test our none existing setup is picked up
|
||||||
|
AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
|
||||||
|
fail("expected an exception due to non existent keep_words_path");
|
||||||
|
} catch (Throwable e) {
|
||||||
|
assertThat(e.getCause(), instanceOf(FailedToResolveConfigException.class));
|
||||||
|
}
|
||||||
|
|
||||||
|
settings = ImmutableSettings.settingsBuilder().put(settings)
|
||||||
|
.put("index.analysis.filter.non_broken_keep_filter.keep_words", new String[]{"test"})
|
||||||
|
.build();
|
||||||
|
try {
|
||||||
|
// test our none existing setup is picked up
|
||||||
|
AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
|
||||||
|
fail("expected an exception indicating that you can't use [keep_words_path] with [keep_words] ");
|
||||||
|
} catch (Throwable e) {
|
||||||
|
assertThat(e.getCause(), instanceOf(ElasticSearchIllegalArgumentException.class));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCaseInsensitiveMapping() throws IOException {
|
public void testCaseInsensitiveMapping() throws IOException {
|
||||||
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(RESOURCE);
|
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(RESOURCE);
|
||||||
|
@ -68,7 +96,7 @@ public class KeepFilterFactoryTests extends ElasticsearchTokenStreamTestCase {
|
||||||
String source = "hello small world";
|
String source = "hello small world";
|
||||||
String[] expected = new String[]{"hello", "world"};
|
String[] expected = new String[]{"hello", "world"};
|
||||||
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(source));
|
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(source));
|
||||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] {1,2});
|
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1, 2});
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -79,7 +107,7 @@ public class KeepFilterFactoryTests extends ElasticsearchTokenStreamTestCase {
|
||||||
String source = "Hello small world";
|
String source = "Hello small world";
|
||||||
String[] expected = new String[]{"Hello"};
|
String[] expected = new String[]{"Hello"};
|
||||||
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(source));
|
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(source));
|
||||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] {1});
|
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1});
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue