Default stopwords list should be `_none_` for all but language-specific analyzers
The `standard_html_strip` and `pattern` analyzers support stopwords, which default to the `english` stopword list. These analyzers should not use any stopwords by default, since they are language-neutral. Closes #4699
This commit is contained in:
parent
0916372520
commit
7f63ddf94e
|
@ -13,6 +13,9 @@ type:
|
||||||
|`lowercase` |Should terms be lowercased or not. Defaults to `true`.
|
|`lowercase` |Should terms be lowercased or not. Defaults to `true`.
|
||||||
|`pattern` |The regular expression pattern, defaults to `\W+`.
|
|`pattern` |The regular expression pattern, defaults to `\W+`.
|
||||||
|`flags` |The regular expression flags.
|
|`flags` |The regular expression flags.
|
||||||
|
|`stopwords` |A list of stopwords to initialize the stop filter with.
|
||||||
|
Defaults to an 'empty' stopword list coming[1.0.0.RC1, Previously
|
||||||
|
defaulted to the English stopwords list]
|
||||||
|===================================================================
|
|===================================================================
|
||||||
|
|
||||||
*IMPORTANT*: The regular expression should match the *token separators*,
|
*IMPORTANT*: The regular expression should match the *token separators*,
|
||||||
|
|
|
@ -23,6 +23,8 @@ import org.apache.lucene.analysis.core.StopAnalyzer;
|
||||||
import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
|
import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
||||||
|
import org.elasticsearch.Version;
|
||||||
|
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||||
import org.elasticsearch.common.inject.Inject;
|
import org.elasticsearch.common.inject.Inject;
|
||||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||||
import org.elasticsearch.common.regex.Regex;
|
import org.elasticsearch.common.regex.Regex;
|
||||||
|
@ -44,9 +46,15 @@ public class PatternAnalyzerProvider extends AbstractIndexAnalyzerProvider<Patte
|
||||||
public PatternAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
|
public PatternAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
|
||||||
super(index, indexSettings, name, settings);
|
super(index, indexSettings, name, settings);
|
||||||
|
|
||||||
|
Version esVersion = indexSettings.getAsVersion(IndexMetaData.SETTING_VERSION_CREATED, org.elasticsearch.Version.CURRENT);
|
||||||
|
final CharArraySet defaultStopwords;
|
||||||
|
if (esVersion.onOrAfter(Version.V_1_0_0_RC1)) {
|
||||||
|
defaultStopwords = CharArraySet.EMPTY_SET;
|
||||||
|
} else {
|
||||||
|
defaultStopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||||
|
}
|
||||||
boolean lowercase = settings.getAsBoolean("lowercase", true);
|
boolean lowercase = settings.getAsBoolean("lowercase", true);
|
||||||
|
CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords, version);
|
||||||
CharArraySet stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET, version);
|
|
||||||
|
|
||||||
String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
|
String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
|
||||||
if (sPattern == null) {
|
if (sPattern == null) {
|
||||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.analysis.core.StopFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
@ -34,17 +35,28 @@ import java.io.Reader;
|
||||||
|
|
||||||
public class StandardHtmlStripAnalyzer extends StopwordAnalyzerBase {
|
public class StandardHtmlStripAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @deprecated use {@link StandardHtmlStripAnalyzer#StandardHtmlStripAnalyzer(org.apache.lucene.util.Version,
|
||||||
|
* org.apache.lucene.analysis.util.CharArraySet)} instead
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
public StandardHtmlStripAnalyzer(Version version) {
|
public StandardHtmlStripAnalyzer(Version version) {
|
||||||
super(version, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
super(version, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public StandardHtmlStripAnalyzer(Version version, CharArraySet stopwords) {
|
||||||
|
super(version, stopwords);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
|
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
|
||||||
final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
|
final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
|
||||||
src.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
|
src.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
TokenStream tok = new StandardFilter(matchVersion, src);
|
TokenStream tok = new StandardFilter(matchVersion, src);
|
||||||
tok = new LowerCaseFilter(matchVersion, tok);
|
tok = new LowerCaseFilter(matchVersion, tok);
|
||||||
tok = new StopFilter(matchVersion, tok, stopwords);
|
if (!stopwords.isEmpty()) {
|
||||||
|
tok = new StopFilter(matchVersion, tok, stopwords);
|
||||||
|
}
|
||||||
return new TokenStreamComponents(src, tok) {
|
return new TokenStreamComponents(src, tok) {
|
||||||
@Override
|
@Override
|
||||||
protected void setReader(final Reader reader) throws IOException {
|
protected void setReader(final Reader reader) throws IOException {
|
||||||
|
|
|
@ -19,9 +19,14 @@
|
||||||
|
|
||||||
package org.elasticsearch.index.analysis;
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.core.StopAnalyzer;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
import org.elasticsearch.Version;
|
||||||
|
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||||
import org.elasticsearch.common.inject.Inject;
|
import org.elasticsearch.common.inject.Inject;
|
||||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.env.Environment;
|
||||||
import org.elasticsearch.index.Index;
|
import org.elasticsearch.index.Index;
|
||||||
import org.elasticsearch.index.settings.IndexSettings;
|
import org.elasticsearch.index.settings.IndexSettings;
|
||||||
|
|
||||||
|
@ -31,11 +36,20 @@ import org.elasticsearch.index.settings.IndexSettings;
|
||||||
public class StandardHtmlStripAnalyzerProvider extends AbstractIndexAnalyzerProvider<StandardHtmlStripAnalyzer> {
|
public class StandardHtmlStripAnalyzerProvider extends AbstractIndexAnalyzerProvider<StandardHtmlStripAnalyzer> {
|
||||||
|
|
||||||
private final StandardHtmlStripAnalyzer analyzer;
|
private final StandardHtmlStripAnalyzer analyzer;
|
||||||
|
private final Version esVersion;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public StandardHtmlStripAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
|
public StandardHtmlStripAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
|
||||||
super(index, indexSettings, name, settings);
|
super(index, indexSettings, name, settings);
|
||||||
analyzer = new StandardHtmlStripAnalyzer(version);
|
this.esVersion = indexSettings.getAsVersion(IndexMetaData.SETTING_VERSION_CREATED, org.elasticsearch.Version.CURRENT);
|
||||||
|
final CharArraySet defaultStopwords;
|
||||||
|
if (esVersion.onOrAfter(Version.V_1_0_0_RC1)) {
|
||||||
|
defaultStopwords = CharArraySet.EMPTY_SET;
|
||||||
|
} else {
|
||||||
|
defaultStopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||||
|
}
|
||||||
|
CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords, version);
|
||||||
|
analyzer = new StandardHtmlStripAnalyzer(version, stopWords);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -131,16 +131,22 @@ public enum PreBuiltAnalyzers {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
PATTERN {
|
PATTERN(CachingStrategy.ELASTICSEARCH) {
|
||||||
@Override
|
@Override
|
||||||
protected Analyzer create(Version version) {
|
protected Analyzer create(Version version) {
|
||||||
|
if (version.onOrAfter(Version.V_1_0_0_RC1)) {
|
||||||
|
return new PatternAnalyzer(version.luceneVersion, Regex.compile("\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/, null), true, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
return new PatternAnalyzer(version.luceneVersion, Regex.compile("\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/, null), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
return new PatternAnalyzer(version.luceneVersion, Regex.compile("\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/, null), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
STANDARD_HTML_STRIP {
|
STANDARD_HTML_STRIP(CachingStrategy.ELASTICSEARCH) {
|
||||||
@Override
|
@Override
|
||||||
protected Analyzer create(Version version) {
|
protected Analyzer create(Version version) {
|
||||||
|
if (version.onOrAfter(Version.V_1_0_0_RC1)) {
|
||||||
|
return new StandardHtmlStripAnalyzer(version.luceneVersion, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
return new StandardHtmlStripAnalyzer(version.luceneVersion);
|
return new StandardHtmlStripAnalyzer(version.luceneVersion);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
|
@ -0,0 +1,69 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||||
|
import org.elasticsearch.test.ElasticsearchTokenStreamTestCase;
|
||||||
|
import org.junit.Ignore;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_VERSION_CREATED;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*/
|
||||||
|
public class AnalyzerBackwardsCompatTests extends ElasticsearchTokenStreamTestCase {
|
||||||
|
|
||||||
|
@Ignore
|
||||||
|
private void testNoStopwordsAfter(org.elasticsearch.Version noStopwordVersion, String type) throws IOException {
|
||||||
|
final int iters = atLeast(10);
|
||||||
|
org.elasticsearch.Version version = org.elasticsearch.Version.CURRENT;
|
||||||
|
for (int i = 0; i < iters; i++) {
|
||||||
|
ImmutableSettings.Builder builder = ImmutableSettings.settingsBuilder().put("index.analysis.filter.my_stop.type", "stop");
|
||||||
|
if (version.onOrAfter(noStopwordVersion)) {
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
builder.put(SETTING_VERSION_CREATED, version);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
builder.put(SETTING_VERSION_CREATED, version);
|
||||||
|
}
|
||||||
|
builder.put("index.analysis.analyzer.foo.type", type);
|
||||||
|
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(builder.build());
|
||||||
|
NamedAnalyzer analyzer = analysisService.analyzer("foo");
|
||||||
|
if (version.onOrAfter(noStopwordVersion)) {
|
||||||
|
assertAnalyzesTo(analyzer, "this is bogus", new String[]{"this", "is", "bogus"});
|
||||||
|
} else {
|
||||||
|
assertAnalyzesTo(analyzer, "this is bogus", new String[]{"bogus"});
|
||||||
|
}
|
||||||
|
version = randomVersion();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testPatternAnalyzer() throws IOException {
|
||||||
|
testNoStopwordsAfter(org.elasticsearch.Version.V_1_0_0_RC1, "pattern");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testStandardHTMLStripAnalyzer() throws IOException {
|
||||||
|
testNoStopwordsAfter(org.elasticsearch.Version.V_1_0_0_RC1, "standard_html_strip");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testStandardAnalyzer() throws IOException {
|
||||||
|
testNoStopwordsAfter(org.elasticsearch.Version.V_1_0_0_Beta1, "standard");
|
||||||
|
}
|
||||||
|
}
|
|
@ -91,6 +91,41 @@ public class PreBuiltAnalyzerTests extends ElasticsearchTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAnalyzerChangedIn10RC1() throws IOException {
|
||||||
|
Analyzer pattern = PreBuiltAnalyzers.PATTERN.getAnalyzer(Version.V_1_0_0_RC1);
|
||||||
|
Analyzer standardHtml = PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(Version.V_1_0_0_RC1);
|
||||||
|
final int n = atLeast(10);
|
||||||
|
Version version = Version.CURRENT;
|
||||||
|
for(int i = 0; i < n; i++) {
|
||||||
|
if (version.equals(Version.V_1_0_0_RC1)) {
|
||||||
|
assertThat(pattern, is(PreBuiltAnalyzers.PATTERN.getAnalyzer(version)));
|
||||||
|
assertThat(standardHtml, is(PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(version)));
|
||||||
|
} else {
|
||||||
|
assertThat(pattern, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version))));
|
||||||
|
assertThat(standardHtml, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version))));
|
||||||
|
}
|
||||||
|
Analyzer analyzer = randomBoolean() ? PreBuiltAnalyzers.PATTERN.getAnalyzer(version) : PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(version);
|
||||||
|
TokenStream ts = analyzer.tokenStream("foo", "This is it Dude");
|
||||||
|
ts.reset();
|
||||||
|
CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
|
||||||
|
List<String> list = new ArrayList<String>();
|
||||||
|
while(ts.incrementToken()) {
|
||||||
|
list.add(charTermAttribute.toString());
|
||||||
|
}
|
||||||
|
if (version.onOrAfter(Version.V_1_0_0_RC1)) {
|
||||||
|
assertThat(list.toString(), list.size(), is(4));
|
||||||
|
assertThat(list, contains("this", "is", "it", "dude"));
|
||||||
|
|
||||||
|
} else {
|
||||||
|
assertThat(list.size(), is(1));
|
||||||
|
assertThat(list, contains("dude"));
|
||||||
|
}
|
||||||
|
ts.close();
|
||||||
|
version = randomVersion();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testThatInstancesAreTheSameAlwaysForKeywordAnalyzer() {
|
public void testThatInstancesAreTheSameAlwaysForKeywordAnalyzer() {
|
||||||
assertThat(PreBuiltAnalyzers.KEYWORD.getAnalyzer(Version.CURRENT),
|
assertThat(PreBuiltAnalyzers.KEYWORD.getAnalyzer(Version.CURRENT),
|
||||||
|
|
|
@ -39,6 +39,7 @@ import org.elasticsearch.index.analysis.AnalysisService;
|
||||||
import org.elasticsearch.index.settings.IndexSettingsModule;
|
import org.elasticsearch.index.settings.IndexSettingsModule;
|
||||||
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
|
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
|
||||||
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
|
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
|
||||||
|
import org.elasticsearch.test.ElasticsearchTestCase;
|
||||||
import org.hamcrest.MatcherAssert;
|
import org.hamcrest.MatcherAssert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
@ -49,7 +50,7 @@ import static org.hamcrest.Matchers.equalTo;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*/
|
*/
|
||||||
public class SynonymsAnalysisTest {
|
public class SynonymsAnalysisTest extends ElasticsearchTestCase {
|
||||||
|
|
||||||
protected final ESLogger logger = Loggers.getLogger(getClass());
|
protected final ESLogger logger = Loggers.getLogger(getClass());
|
||||||
private AnalysisService analysisService;
|
private AnalysisService analysisService;
|
||||||
|
|
|
@ -19,10 +19,14 @@
|
||||||
|
|
||||||
package org.elasticsearch.test;
|
package org.elasticsearch.test;
|
||||||
|
|
||||||
import com.carrotsearch.randomizedtesting.annotations.*;
|
import com.carrotsearch.randomizedtesting.annotations.Listeners;
|
||||||
|
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
|
||||||
|
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
|
||||||
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope;
|
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope;
|
||||||
|
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.util.TimeUnits;
|
import org.apache.lucene.util.TimeUnits;
|
||||||
|
import org.elasticsearch.Version;
|
||||||
import org.elasticsearch.test.junit.listeners.ReproduceInfoPrinter;
|
import org.elasticsearch.test.junit.listeners.ReproduceInfoPrinter;
|
||||||
|
|
||||||
@Listeners({
|
@Listeners({
|
||||||
|
@ -38,4 +42,7 @@ import org.elasticsearch.test.junit.listeners.ReproduceInfoPrinter;
|
||||||
*/
|
*/
|
||||||
public abstract class ElasticsearchTokenStreamTestCase extends BaseTokenStreamTestCase {
|
public abstract class ElasticsearchTokenStreamTestCase extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
public static Version randomVersion() {
|
||||||
|
return ElasticsearchTestCase.randomVersion(random());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue