From 4d19239ec4966c82b2cc9793902686f0cbab0fcf Mon Sep 17 00:00:00 2001 From: Alexander Reelsen Date: Tue, 15 Oct 2013 10:56:57 +0200 Subject: [PATCH] Add support for Lucene SuggestStopFilter The suggest stop filter is an improved version of the stop filter, which only treats the last term of a query as a stopword if it is followed by whitespace. This allows you to keep removing stopwords, but still suggest for "a". Example: Index document content "a word". You are now able to suggest for "a" and get back results in the completion suggester, if the suggest stop filter is used on the query side, but will not get back any results for "a " as this is identified as a stopword. The implementation allows setting the `remove_trailing` parameter to `false` for a custom stop filter, thus using the suggest stop filter instead of the standard stop filter. --- .../tokenfilters/stop-tokenfilter.asciidoc | 5 ++++ .../analysis/StopTokenFilterFactory.java | 13 +++++++-- .../index/analysis/StopTokenFilterTests.java | 15 ++++++++-- .../suggest/CompletionSuggestSearchTests.java | 29 +++++++++++++++++++ 4 files changed, 57 insertions(+), 5 deletions(-) diff --git a/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc index e940d62b3ae..14b3a32b2f8 100644 --- a/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc @@ -20,6 +20,11 @@ encoded. |`ignore_case` |Set to `true` to lower case all words first. Defaults to `false`. + +|`remove_trailing` |Set to `false` in order to not ignore the last term of +a search if it is a stop word. This is very useful for the completion +suggester as a query like `green a` can be extended to `green apple` even +though you remove stop words in general. Defaults to `true`.
|======================================================================= stopwords allow for custom language specific expansion of default diff --git a/src/main/java/org/elasticsearch/index/analysis/StopTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/StopTokenFilterFactory.java index 3eba7f7a871..a87c5f048c5 100644 --- a/src/main/java/org/elasticsearch/index/analysis/StopTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/StopTokenFilterFactory.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter; import org.apache.lucene.util.Version; import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.common.inject.Inject; @@ -45,11 +46,13 @@ public class StopTokenFilterFactory extends AbstractTokenFilterFactory { private final boolean ignoreCase; private final boolean enablePositionIncrements; + private final boolean removeTrailing; @Inject public StopTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); this.ignoreCase = settings.getAsBoolean("ignore_case", false); + this.removeTrailing = settings.getAsBoolean("remove_trailing", true); this.stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET, version, ignoreCase); this.enablePositionIncrements = settings.getAsBoolean("enable_position_increments", true); if (!enablePositionIncrements && version.onOrAfter(Version.LUCENE_44)) { @@ -60,9 +63,13 @@ public class StopTokenFilterFactory extends AbstractTokenFilterFactory { @Override public TokenStream create(TokenStream tokenStream) { - StopFilter filter = new StopFilter(version, tokenStream, 
stopWords); - filter.setEnablePositionIncrements(enablePositionIncrements); - return filter; + if (removeTrailing) { + StopFilter filter = new StopFilter(version, tokenStream, stopWords); + filter.setEnablePositionIncrements(enablePositionIncrements); + return filter; + } else { + return new SuggestStopFilter(tokenStream, stopWords); + } } public Set stopWords() { diff --git a/src/test/java/org/elasticsearch/index/analysis/StopTokenFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/StopTokenFilterTests.java index 65ed7bb566f..b389790b6fd 100644 --- a/src/test/java/org/elasticsearch/index/analysis/StopTokenFilterTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/StopTokenFilterTests.java @@ -22,6 +22,7 @@ package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter; import org.apache.lucene.util.Version; import org.elasticsearch.common.inject.ProvisionException; import org.elasticsearch.common.settings.ImmutableSettings; @@ -66,7 +67,6 @@ public class StopTokenFilterTests extends ElasticsearchTokenStreamTestCase { TokenStream create = tokenFilter.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("foo bar"))); assertThat(create, instanceOf(StopFilter.class)); assertThat(((StopFilter)create).getEnablePositionIncrements(), equalTo(true)); - } @Test @@ -80,7 +80,18 @@ public class StopTokenFilterTests extends ElasticsearchTokenStreamTestCase { TokenStream create = tokenFilter.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("foo bar"))); assertThat(create, instanceOf(StopFilter.class)); assertThat(((StopFilter)create).getEnablePositionIncrements(), equalTo(false)); - } + @Test + public void testThatSuggestStopFilterWorks() throws Exception { + Settings settings = ImmutableSettings.settingsBuilder() 
+ .put("index.analysis.filter.my_stop.type", "stop") + .put("index.analysis.filter.my_stop.remove_trailing", false) + .build(); + AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings); + TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_stop"); + assertThat(tokenFilter, instanceOf(StopTokenFilterFactory.class)); + TokenStream create = tokenFilter.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("foo an"))); + assertThat(create, instanceOf(SuggestStopFilter.class)); + } } diff --git a/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchTests.java b/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchTests.java index b75683c7c9e..db4e0bc0193 100644 --- a/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchTests.java +++ b/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchTests.java @@ -578,6 +578,35 @@ public class CompletionSuggestSearchTests extends AbstractIntegrationTest { } } + @Test + public void testThatSuggestStopFilterWorks() throws Exception { + ImmutableSettings.Builder settingsBuilder = settingsBuilder() + .put("index.analysis.analyzer.stoptest.tokenizer", "standard") + .putArray("index.analysis.analyzer.stoptest.filter", "standard", "suggest_stop_filter") + .put("index.analysis.filter.suggest_stop_filter.type", "stop") + .put("index.analysis.filter.suggest_stop_filter.remove_trailing", false); + + createIndexAndMappingAndSettings(settingsBuilder, "simple", "stoptest", true, true, true); + + client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder() + .startObject().field(FIELD, "Feed trolls").endObject() + ).get(); + + client().prepareIndex(INDEX, TYPE, "2").setSource(jsonBuilder() + .startObject().field(FIELD, "Feed the trolls").endObject() + ).get(); + + refresh(); + + assertSuggestions("feed t", "Feed the trolls", "Feed trolls"); + assertSuggestions("feed th", "Feed the trolls"); + 
assertSuggestions("feed the", "Feed the trolls"); + // stop word complete, gets ignored at query time, makes it "feed" only + assertSuggestions("feed the ", "Feed the trolls", "Feed trolls"); + // stopword gets removed, but position increment kicks in, which doesn't work for the prefix suggester + assertSuggestions("feed the t"); + } + @Test(expected = MapperParsingException.class) public void testThatIndexingInvalidFieldsInCompletionFieldResultsInException() throws Exception { createIndexAndMapping();