Keep word filter threw an error if `keep_words_path` was specified.

Closes #4073
Vojtech Hyza 2013-09-13 14:29:22 +02:00 committed by Boaz Leskes
parent 555ececaa9
commit 47969efae9
2 changed files with 55 additions and 36 deletions
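
The underlying bug: `Settings#getAsArray(String)` falls back to an empty array rather than `null` for a missing key, so the factory's either/or check saw `keep_words` as configured even when only `keep_words_path` was set, and rejected a valid path-only configuration. A minimal sketch of such a configuration, built with the test helpers from the second file below (the filter name and word-file path are made up for illustration):

    Settings settings = ImmutableSettings.settingsBuilder()
            .put("index.analysis.filter.my_keep_filter.type", "keep")
            .put("index.analysis.filter.my_keep_filter.keep_words_path", "analysis/keep_words.txt")
            .build();
    // Before this commit the next line threw ElasticSearchIllegalArgumentException
    // ("keep requires either `keep_words` or `keep_words_path` to be configured");
    // with the fix, a path-only setup is accepted.
    AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);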

src/main/java/org/elasticsearch/index/analysis/KeepWordFilterFactory.java

@@ -17,14 +17,11 @@ package org.elasticsearch.index.analysis;
  * specific language governing permissions and limitations
  * under the License.
  */
-
-import java.util.Arrays;
-import java.util.Map;
+import org.apache.lucene.util.Version;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.Version;
 import org.elasticsearch.ElasticSearchIllegalArgumentException;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
@@ -32,33 +29,31 @@ import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.settings.IndexSettings;
-import org.elasticsearch.indices.analysis.IndicesAnalysisService;
 
 /**
  * A {@link TokenFilterFactory} for {@link KeepWordFilter}. This filter only
  * keep tokens that are contained in the term set configured via
  * {@value #KEEP_WORDS_KEY} setting. This filter acts like an inverse stop
  * filter.
- *
+ * <p/>
  * Configuration options:
- *
+ * <p/>
 * <ul>
 * <li>{@value #KEEP_WORDS_KEY} the array of words / tokens to keep.</li>
- *
+ * <p/>
-* {@value #KEEP_WORDS_PATH_KEY} an reference to a file containing the words
+* <li>{@value #KEEP_WORDS_PATH_KEY} an reference to a file containing the words
 * / tokens to keep. Note: this is an alternative to {@value #KEEP_WORDS_KEY} if
 * both are set an exception will be thrown.</li>
- *
+ * <p/>
 * <li>{@value #ENABLE_POS_INC_KEY} <code>true</code> iff the filter should
 * maintain position increments for dropped tokens. The default is
 * <code>true</code>.</li>
- *
+ * <p/>
 * <li>{@value #KEEP_WORDS_CASE_KEY} to use case sensitive keep words. The
 * default is <code>false</code> which corresponds to case-sensitive.</li>
 * </ul>
 *
 * @see StopTokenFilterFactory
- *
 */
 @AnalysisSettingsRequired
 public class KeepWordFilterFactory extends AbstractTokenFilterFactory {
@@ -71,19 +66,18 @@ public class KeepWordFilterFactory extends AbstractTokenFilterFactory {
 
     @Inject
     public KeepWordFilterFactory(Index index, @IndexSettings Settings indexSettings,
-                                 Environment env, IndicesAnalysisService indicesAnalysisService, Map<String, TokenizerFactoryFactory> tokenizerFactories,
-                                 @Assisted String name, @Assisted Settings settings) {
+                                 Environment env, @Assisted String name, @Assisted Settings settings) {
         super(index, indexSettings, name, settings);
-        final String[] arrayKeepWords = settings.getAsArray(KEEP_WORDS_KEY);
+        final String[] arrayKeepWords = settings.getAsArray(KEEP_WORDS_KEY, null);
         final String keepWordsPath = settings.get(KEEP_WORDS_PATH_KEY, null);
-        if (!(arrayKeepWords == null ^ keepWordsPath == null)) {
-            // we don't allow both or non
+        if ((arrayKeepWords == null && keepWordsPath == null) || (arrayKeepWords != null && keepWordsPath != null)) {
+            // we don't allow both or none
            throw new ElasticSearchIllegalArgumentException("keep requires either `" + KEEP_WORDS_KEY + "` or `"
                    + KEEP_WORDS_PATH_KEY + "` to be configured");
        }
 
        if (version.onOrAfter(Version.LUCENE_44) && settings.get(ENABLE_POS_INC_KEY) != null) {
            throw new ElasticSearchIllegalArgumentException(ENABLE_POS_INC_KEY + " is not supported anymore. Please fix your analysis chain or use"
                    + " an older compatibility version (<=4.3) but beware that it might cause highlighting bugs.");
        }
        enablePositionIncrements = version.onOrAfter(Version.LUCENE_44) ? true : settings.getAsBoolean(ENABLE_POS_INC_KEY, true);
@@ -101,7 +95,4 @@ public class KeepWordFilterFactory extends AbstractTokenFilterFactory {
     }
 }
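
For clarity: the old `!(arrayKeepWords == null ^ keepWordsPath == null)` guard and the new spelled-out condition are logically equivalent; the behavioral fix comes entirely from the `getAsArray(KEEP_WORDS_KEY, null)` default above, which makes an absent `keep_words` genuinely `null`. A standalone sketch of the rule the condition encodes (the helper name is hypothetical, and plain `IllegalArgumentException` stands in for `ElasticSearchIllegalArgumentException`):

    // Exactly one of the two word sources may be configured: reject none and both.
    static void requireExactlyOne(String[] keepWords, String keepWordsPath) {
        boolean none = keepWords == null && keepWordsPath == null;
        boolean both = keepWords != null && keepWordsPath != null;
        if (none || both) {
            throw new IllegalArgumentException(
                    "keep requires either `keep_words` or `keep_words_path` to be configured");
        }
    }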

src/test/java/org/elasticsearch/index/analysis/KeepFilterFactoryTests.java

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.elasticsearch.ElasticSearchIllegalArgumentException;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.FailedToResolveConfigException;
 import org.elasticsearch.test.ElasticsearchTokenStreamTestCase;
 import org.junit.Assert;
 import org.junit.Test;
@@ -48,10 +49,10 @@ public class KeepFilterFactoryTests extends ElasticsearchTokenStreamTestCase {
     @Test
     public void testLoadOverConfiguredSettings() {
         Settings settings = ImmutableSettings.settingsBuilder()
-            .put("index.analysis.filter.broken_keep_filter.type", "keep")
-            .put("index.analysis.filter.broken_keep_filter.keep_words_path", "does/not/exists.txt")
-            .put("index.analysis.filter.broken_keep_filter.keep_words", "[\"Hello\", \"worlD\"]")
-            .build();
+                .put("index.analysis.filter.broken_keep_filter.type", "keep")
+                .put("index.analysis.filter.broken_keep_filter.keep_words_path", "does/not/exists.txt")
+                .put("index.analysis.filter.broken_keep_filter.keep_words", "[\"Hello\", \"worlD\"]")
+                .build();
         try {
             AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
             Assert.fail("path and array are configured");
@@ -60,6 +61,33 @@ public class KeepFilterFactoryTests extends ElasticsearchTokenStreamTestCase {
         }
     }
 
+    @Test
+    public void testKeepWordsPathSettings() {
+        Settings settings = ImmutableSettings.settingsBuilder()
+                .put("index.analysis.filter.non_broken_keep_filter.type", "keep")
+                .put("index.analysis.filter.non_broken_keep_filter.keep_words_path", "does/not/exists.txt")
+                .build();
+        try {
+            // test our none existing setup is picked up
+            AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            fail("expected an exception due to non existent keep_words_path");
+        } catch (Throwable e) {
+            assertThat(e.getCause(), instanceOf(FailedToResolveConfigException.class));
+        }
+
+        settings = ImmutableSettings.settingsBuilder().put(settings)
+                .put("index.analysis.filter.non_broken_keep_filter.keep_words", new String[]{"test"})
+                .build();
+        try {
+            // test our none existing setup is picked up
+            AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            fail("expected an exception indicating that you can't use [keep_words_path] with [keep_words] ");
+        } catch (Throwable e) {
+            assertThat(e.getCause(), instanceOf(ElasticSearchIllegalArgumentException.class));
+        }
+    }
+
     @Test
     public void testCaseInsensitiveMapping() throws IOException {
         AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(RESOURCE);
@@ -68,7 +96,7 @@ public class KeepFilterFactoryTests extends ElasticsearchTokenStreamTestCase {
         String source = "hello small world";
         String[] expected = new String[]{"hello", "world"};
         Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] {1,2});
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1, 2});
     }
 
     @Test
@@ -79,7 +107,7 @@ public class KeepFilterFactoryTests extends ElasticsearchTokenStreamTestCase {
         String source = "Hello small world";
         String[] expected = new String[]{"Hello"};
         Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] {1});
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1});
     }
 }
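
For context, a hedged end-to-end sketch of what the fix enables: a custom analyzer whose keep filter is fed only from a file. The filter, analyzer, and path names here are illustrative, and the word file must resolve under the node's config directory, otherwise the analysis service fails with FailedToResolveConfigException, exactly as the first new test asserts:

    Settings settings = ImmutableSettings.settingsBuilder()
            .put("index.analysis.filter.file_keep.type", "keep")
            .put("index.analysis.filter.file_keep.keep_words_path", "analysis/keep.txt")
            .put("index.analysis.analyzer.kept_only.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.kept_only.filter", "file_keep")
            .build();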