Change 'standard' analyzer to use an empty stopword list by default.

The 'default' / 'standard' analyzer can be a trappy default since it filters
English stopwords by default. Yet a default should not be dedicated to a certain language
since Elasticsearch is used in many different scenarios where a standard analysis chain
specialized for English full-text might be rather counterproductive.

This commit changes the 'standard' analyzer to use an empty stopword list for indices
that are created from version 1.0.0.Beta1 onwards, but maintains backwards compatibility
for older indices.

Closes #3775
This commit is contained in:
Simon Willnauer 2013-11-05 17:25:04 +01:00
parent 1586339ee0
commit 9654631186
10 changed files with 115 additions and 28 deletions

View File

@ -18,8 +18,8 @@ type:
|=======================================================================
|Setting |Description
|`stopwords` |A list of stopwords to initialize the stop filter with.
Defaults to the english stop words.
Defaults to an 'empty' stopword list coming[1.0.0.Beta1, Previously
defaulted to the English stopwords list]
|`max_token_length` |The maximum token length. If a token is seen that
exceeds this length then it is discarded. Defaults to `255`.
|=======================================================================

View File

@ -22,6 +22,8 @@ package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
@ -35,11 +37,20 @@ import org.elasticsearch.index.settings.IndexSettings;
public class StandardAnalyzerProvider extends AbstractIndexAnalyzerProvider<StandardAnalyzer> {
private final StandardAnalyzer standardAnalyzer;
private final Version esVersion;
@Inject
public StandardAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
CharArraySet stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET, version);
this.esVersion = indexSettings.getAsVersion(IndexMetaData.SETTING_VERSION_CREATED, org.elasticsearch.Version.CURRENT);
final CharArraySet defaultStopwords;
if (esVersion.onOrAfter(Version.V_1_0_0_Beta1)) {
defaultStopwords = CharArraySet.EMPTY_SET;
} else {
defaultStopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}
CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords, version);
int maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
standardAnalyzer = new StandardAnalyzer(version, stopWords);
standardAnalyzer.setMaxTokenLength(maxTokenLength);

View File

@ -61,6 +61,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.Version;
import org.elasticsearch.common.regex.Regex;
@ -73,9 +74,12 @@ import java.util.Map;
*/
public enum PreBuiltAnalyzers {
STANDARD() {
STANDARD(CachingStrategy.ELASTICSEARCH) { // we don't do stopwords anymore from 1.0Beta on
@Override
protected Analyzer create(Version version) {
if (version.onOrAfter(Version.V_1_0_0_Beta1)) {
return new StandardAnalyzer(version.luceneVersion, CharArraySet.EMPTY_SET);
}
return new StandardAnalyzer(version.luceneVersion);
}
},

View File

@ -122,7 +122,7 @@ public class SimpleQueryTests extends AbstractIntegrationTest {
countResponse = client().prepareCount().setQuery(QueryBuilders.matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.OR)).execute().actionGet();
assertHitCount(countResponse, 3l);
countResponse = client().prepareCount().setQuery(QueryBuilders.matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND).analyzer("standard")).execute().actionGet();
countResponse = client().prepareCount().setQuery(QueryBuilders.matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND).analyzer("stop")).execute().actionGet();
assertHitCount(countResponse, 3l);
// the "stop" analyzer drops "the" since it's a stopword
@ -480,8 +480,13 @@ public class SimpleQueryTests extends AbstractIntegrationTest {
}
@Test
public void testMatchQueryZeroTermsQuery() {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 1)).execute().actionGet();
public void testMatchQueryZeroTermsQuery() throws IOException {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 1))
.addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties")
.startObject("field1").field("type", "string").field("analyzer", "classic").endObject()
.startObject("field2").field("type", "string").field("analyzer", "classic").endObject()
.endObject().endObject().endObject())
.execute().actionGet();
client().prepareIndex("test", "type1", "1").setSource("field1", "value1").execute().actionGet();
client().prepareIndex("test", "type1", "2").setSource("field1", "value2").execute().actionGet();
client().admin().indices().prepareRefresh("test").execute().actionGet();
@ -511,8 +516,13 @@ public class SimpleQueryTests extends AbstractIntegrationTest {
}
@Test
public void testMultiMatchQueryZeroTermsQuery() {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 1)).execute().actionGet();
public void testMultiMatchQueryZeroTermsQuery() throws IOException {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 1))
.addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties")
.startObject("field1").field("type", "string").field("analyzer", "classic").endObject()
.startObject("field2").field("type", "string").field("analyzer", "classic").endObject()
.endObject().endObject().endObject())
.execute().actionGet();
client().prepareIndex("test", "type1", "1").setSource("field1", "value1", "field2", "value2").execute().actionGet();
client().prepareIndex("test", "type1", "2").setSource("field1", "value3", "field2", "value4").execute().actionGet();
client().admin().indices().prepareRefresh("test").execute().actionGet();

View File

@ -19,6 +19,8 @@
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.ImmutableSettings;
@ -32,10 +34,11 @@ import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.*;
/**
*
@ -51,6 +54,43 @@ public class PreBuiltAnalyzerTests extends ElasticsearchTestCase {
assertThat(currentDefaultAnalyzer, is(currentStandardAnalyzer));
}
/**
 * Checks the token output of the pre-built 'default' analyzer across versions:
 * for versions on or after 1.0.0.Beta1 all four tokens of "This is it Dude" are
 * kept, while for earlier versions only "dude" remains.
 * Also checks that 'default' and 'standard' resolve to the same instance for
 * 1.0.0.Beta1 and that other versions return a different instance.
 *
 * @throws IOException if consuming the token stream fails
 */
@Test
public void testThatDefaultAndStandardAnalyzerChangedIn10Beta1() throws IOException {
    Analyzer currentStandardAnalyzer = PreBuiltAnalyzers.STANDARD.getAnalyzer(Version.V_1_0_0_Beta1);
    Analyzer currentDefaultAnalyzer = PreBuiltAnalyzers.DEFAULT.getAnalyzer(Version.V_1_0_0_Beta1);
    // special case, these two are the same instance
    assertThat(currentDefaultAnalyzer, is(currentStandardAnalyzer));
    final int n = atLeast(10);
    // the first iteration always checks the current version, later ones use random versions
    Version version = Version.CURRENT;
    for (int i = 0; i < n; i++) {
        // look the analyzer up once and use it for both the identity check and the token check
        Analyzer analyzer = PreBuiltAnalyzers.DEFAULT.getAnalyzer(version);
        if (version.equals(Version.V_1_0_0_Beta1)) {
            assertThat(currentDefaultAnalyzer, is(analyzer));
        } else {
            assertThat(currentDefaultAnalyzer, not(is(analyzer)));
        }
        List<String> list = new ArrayList<String>();
        TokenStream ts = analyzer.tokenStream("foo", "This is it Dude");
        try {
            ts.reset();
            CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
            while (ts.incrementToken()) {
                list.add(charTermAttribute.toString());
            }
            // complete the TokenStream consumer workflow before closing
            ts.end();
        } finally {
            // always release the stream, even if consuming it failed
            ts.close();
        }
        if (version.onOrAfter(Version.V_1_0_0_Beta1)) {
            // no stopword filtering: every token survives (lowercased)
            assertThat(list.size(), is(4));
            assertThat(list, contains("this", "is", "it", "dude"));
        } else {
            // English stopwords "this", "is" and "it" are removed
            assertThat(list.size(), is(1));
            assertThat(list, contains("dude"));
        }
        version = randomVersion();
    }
}
@Test
public void testThatInstancesAreTheSameAlwaysForKeywordAnalyzer() {
assertThat(PreBuiltAnalyzers.KEYWORD.getAnalyzer(Version.CURRENT),

View File

@ -50,8 +50,20 @@ public class AnalyzeActionTests extends AbstractIntegrationTest {
for (int i = 0; i < 10; i++) {
AnalyzeResponse analyzeResponse = client().admin().indices().prepareAnalyze("test", "this is a test").execute().actionGet();
assertThat(analyzeResponse.getTokens().size(), equalTo(1));
assertThat(analyzeResponse.getTokens().size(), equalTo(4));
AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(0);
assertThat(token.getTerm(), equalTo("this"));
assertThat(token.getStartOffset(), equalTo(0));
assertThat(token.getEndOffset(), equalTo(4));
token = analyzeResponse.getTokens().get(1);
assertThat(token.getTerm(), equalTo("is"));
assertThat(token.getStartOffset(), equalTo(5));
assertThat(token.getEndOffset(), equalTo(7));
token = analyzeResponse.getTokens().get(2);
assertThat(token.getTerm(), equalTo("a"));
assertThat(token.getStartOffset(), equalTo(8));
assertThat(token.getEndOffset(), equalTo(9));
token = analyzeResponse.getTokens().get(3);
assertThat(token.getTerm(), equalTo("test"));
assertThat(token.getStartOffset(), equalTo(10));
assertThat(token.getEndOffset(), equalTo(14));

View File

@ -1048,7 +1048,7 @@ public class HighlighterSearchTests extends AbstractIntegrationTest {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 2))
.addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties")
.startObject("title").field("type", "multi_field").startObject("fields")
.startObject("title").field("type", "string").field("store", "yes").field("term_vector", "with_positions_offsets").endObject()
.startObject("title").field("type", "string").field("store", "yes").field("term_vector", "with_positions_offsets").field("analyzer", "classic").endObject()
.startObject("key").field("type", "string").field("store", "yes").field("term_vector", "with_positions_offsets").field("analyzer", "whitespace").endObject()
.endObject().endObject()
.endObject().endObject().endObject())
@ -1084,7 +1084,7 @@ public class HighlighterSearchTests extends AbstractIntegrationTest {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 2))
.addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties")
.startObject("title").field("type", "multi_field").startObject("fields")
.startObject("title").field("type", "string").field("store", "no").field("term_vector", "with_positions_offsets").endObject()
.startObject("title").field("type", "string").field("store", "no").field("term_vector", "with_positions_offsets").field("analyzer", "classic").endObject()
.startObject("key").field("type", "string").field("store", "no").field("term_vector", "with_positions_offsets").field("analyzer", "whitespace").endObject()
.endObject().endObject()
.endObject().endObject().endObject())
@ -1123,7 +1123,7 @@ public class HighlighterSearchTests extends AbstractIntegrationTest {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 2))
.addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties")
.startObject("title").field("type", "multi_field").startObject("fields")
.startObject("title").field("type", "string").field("store", "yes").field("term_vector", "no").endObject()
.startObject("title").field("type", "string").field("store", "yes").field("term_vector", "no").field("analyzer", "classic").endObject()
.startObject("key").field("type", "string").field("store", "yes").field("term_vector", "no").field("analyzer", "whitespace").endObject()
.endObject().endObject()
.endObject().endObject().endObject())
@ -1161,7 +1161,7 @@ public class HighlighterSearchTests extends AbstractIntegrationTest {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 2))
.addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties")
.startObject("title").field("type", "multi_field").startObject("fields")
.startObject("title").field("type", "string").field("store", "no").field("term_vector", "no").endObject()
.startObject("title").field("type", "string").field("store", "no").field("term_vector", "no").field("analyzer", "classic").endObject()
.startObject("key").field("type", "string").field("store", "no").field("term_vector", "no").field("analyzer", "whitespace").endObject()
.endObject().endObject()
.endObject().endObject().endObject())
@ -1230,7 +1230,7 @@ public class HighlighterSearchTests extends AbstractIntegrationTest {
public void testDisableFastVectorHighlighter() throws Exception {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 2))
.addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties")
.startObject("title").field("type", "string").field("store", "yes").field("term_vector", "with_positions_offsets").endObject()
.startObject("title").field("type", "string").field("store", "yes").field("term_vector", "with_positions_offsets").field("analyzer", "classic").endObject()
.endObject().endObject().endObject())
.execute().actionGet();
ensureGreen();
@ -2326,7 +2326,7 @@ public class HighlighterSearchTests extends AbstractIntegrationTest {
.startObject("_source").field("enabled", false).endObject()
.startObject("properties")
.startObject("title").field("type", "multi_field").startObject("fields")
.startObject("title").field("type", "string").field("store", "yes").field("index_options", "offsets").endObject()
.startObject("title").field("type", "string").field("store", "yes").field("index_options", "offsets").field("analyzer", "classic").endObject()
.startObject("key").field("type", "string").field("store", "yes").field("index_options", "offsets").field("analyzer", "whitespace").endObject()
.endObject().endObject()
.endObject().endObject().endObject())
@ -2363,7 +2363,7 @@ public class HighlighterSearchTests extends AbstractIntegrationTest {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 2))
.addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties")
.startObject("title").field("type", "multi_field").startObject("fields")
.startObject("title").field("type", "string").field("store", "no").field("index_options", "offsets").endObject()
.startObject("title").field("type", "string").field("store", "no").field("index_options", "offsets").field("analyzer", "classic").endObject()
.startObject("key").field("type", "string").field("store", "no").field("index_options", "offsets").field("analyzer", "whitespace").endObject()
.endObject().endObject()
.endObject().endObject().endObject())

View File

@ -271,7 +271,7 @@ public class SimpleQueryTests extends AbstractIntegrationTest {
assertThat(searchResponse.getHits().totalHits(), equalTo(1l));
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("2"));
searchResponse = client().prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3).analyzer("standard")).execute().actionGet();
searchResponse = client().prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3).analyzer("stop")).execute().actionGet();
assertThat(searchResponse.getHits().totalHits(), equalTo(3l));
// the "stop" analyzer drops "the" since it's a stopword
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
@ -290,7 +290,7 @@ public class SimpleQueryTests extends AbstractIntegrationTest {
assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("3"));
searchResponse = client().prepareSearch().setQuery(QueryBuilders.matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND).analyzer("standard")).execute().actionGet();
searchResponse = client().prepareSearch().setQuery(QueryBuilders.matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND).analyzer("stop")).execute().actionGet();
assertThat(searchResponse.getHits().totalHits(), equalTo(3l));
// the "stop" analyzer drops "the" since it's a stopword
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
@ -730,8 +730,13 @@ public class SimpleQueryTests extends AbstractIntegrationTest {
}
@Test
public void testMatchQueryZeroTermsQuery() {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 1)).execute().actionGet();
public void testMatchQueryZeroTermsQuery() throws IOException {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 1))
.addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties")
.startObject("field1").field("type", "string").field("analyzer", "classic").endObject()
.startObject("field2").field("type", "string").field("analyzer", "classic").endObject()
.endObject().endObject().endObject())
.execute().actionGet();
client().prepareIndex("test", "type1", "1").setSource("field1", "value1").execute().actionGet();
client().prepareIndex("test", "type1", "2").setSource("field1", "value2").execute().actionGet();
client().admin().indices().prepareRefresh("test").execute().actionGet();
@ -761,9 +766,13 @@ public class SimpleQueryTests extends AbstractIntegrationTest {
}
@Test
public void testMultiMatchQueryZeroTermsQuery() {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 1)).execute().actionGet();
client().prepareIndex("test", "type1", "1").setSource("field1", "value1", "field2", "value2").execute().actionGet();
public void testMultiMatchQueryZeroTermsQuery() throws IOException {
client().admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 1))
.addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties")
.startObject("field1").field("type", "string").field("analyzer", "classic").endObject()
.startObject("field2").field("type", "string").field("analyzer", "classic").endObject()
.endObject().endObject().endObject())
.execute().actionGet(); client().prepareIndex("test", "type1", "1").setSource("field1", "value1", "field2", "value2").execute().actionGet();
client().prepareIndex("test", "type1", "2").setSource("field1", "value3", "field2", "value4").execute().actionGet();
client().admin().indices().prepareRefresh("test").execute().actionGet();

View File

@ -334,7 +334,7 @@ public class CompletionSuggestSearchTests extends AbstractIntegrationTest {
@Test
public void testThatDisablingPositionIncrementsWorkForStopwords() throws Exception {
// analyzer which removes stopwords... so may not be the simple one
createIndexAndMapping("standard", "standard", false, false, false);
createIndexAndMapping("classic", "classic", false, false, false);
client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder()
.startObject().startObject(FIELD)

View File

@ -27,6 +27,7 @@ import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.IndexSearcher;
@ -291,7 +292,7 @@ public abstract class AbstractTermVectorTests extends AbstractIntegrationTest {
});
}
}
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.CURRENT.luceneVersion), mapping);
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.CURRENT.luceneVersion, CharArraySet.EMPTY_SET), mapping);
Directory dir = new RAMDirectory();
IndexWriterConfig conf = new IndexWriterConfig(Version.CURRENT.luceneVersion, wrapper);