From 7f63ddf94ef6066377f83ac81ecf2885775b01f7 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 13 Jan 2014 14:21:26 +0100 Subject: [PATCH] Default stopwords list should be `_none_` for all but language-specific analyzers `standard_html_strip` and `pattern` analyzer support stopwords which are set to the default `english` stopwords by default. Those analyzers should not use stopwords by default since they are language neutral Closes #4699 --- .../analyzers/pattern-analyzer.asciidoc | 3 + .../analysis/PatternAnalyzerProvider.java | 12 +++- .../analysis/StandardHtmlStripAnalyzer.java | 14 +++- .../StandardHtmlStripAnalyzerProvider.java | 18 ++++- .../indices/analysis/PreBuiltAnalyzers.java | 10 ++- .../AnalyzerBackwardsCompatTests.java | 69 +++++++++++++++++++ .../index/analysis/PreBuiltAnalyzerTests.java | 35 ++++++++++ .../synonyms/SynonymsAnalysisTest.java | 3 +- .../ElasticsearchTokenStreamTestCase.java | 9 ++- 9 files changed, 164 insertions(+), 9 deletions(-) create mode 100644 src/test/java/org/elasticsearch/index/analysis/AnalyzerBackwardsCompatTests.java diff --git a/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc b/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc index 97464677331..04f71bfc773 100644 --- a/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc +++ b/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc @@ -13,6 +13,9 @@ type: |`lowercase` |Should terms be lowercased or not. Defaults to `true`. |`pattern` |The regular expression pattern, defaults to `\W+`. |`flags` |The regular expression flags. +|`stopwords` |A list of stopwords to initialize the stop filter with. +Defaults to an 'empty' stopword list coming[1.0.0.RC1, Previously +defaulted to the English stopwords list] |=================================================================== *IMPORTANT*: The regular expression should match the *token separators*, diff --git a/src/main/java/org/elasticsearch/index/analysis/PatternAnalyzerProvider.java b/src/main/java/org/elasticsearch/index/analysis/PatternAnalyzerProvider.java index 4885c7fc97e..af053b3d47f 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PatternAnalyzerProvider.java +++ b/src/main/java/org/elasticsearch/index/analysis/PatternAnalyzerProvider.java @@ -23,6 +23,8 @@ import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer; import org.apache.lucene.analysis.util.CharArraySet; import org.elasticsearch.ElasticsearchIllegalArgumentException; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.regex.Regex; @@ -44,9 +46,15 @@ public class PatternAnalyzerProvider extends AbstractIndexAnalyzerProvider { private final StandardHtmlStripAnalyzer analyzer; + private final Version esVersion; @Inject - public StandardHtmlStripAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + public StandardHtmlStripAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); - analyzer = new StandardHtmlStripAnalyzer(version); + this.esVersion = indexSettings.getAsVersion(IndexMetaData.SETTING_VERSION_CREATED, org.elasticsearch.Version.CURRENT); + final CharArraySet defaultStopwords; + if (esVersion.onOrAfter(Version.V_1_0_0_RC1)) { + defaultStopwords = CharArraySet.EMPTY_SET; + } else { + defaultStopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET; + } + CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords, version); + analyzer = new StandardHtmlStripAnalyzer(version, stopWords); } @Override diff --git a/src/main/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzers.java b/src/main/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzers.java index 6f5fe8ae3c6..0f5599795a7 100644 --- a/src/main/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzers.java +++ b/src/main/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzers.java @@ -131,16 +131,22 @@ public enum PreBuiltAnalyzers { } }, - PATTERN { + PATTERN(CachingStrategy.ELASTICSEARCH) { @Override protected Analyzer create(Version version) { + if (version.onOrAfter(Version.V_1_0_0_RC1)) { + return new PatternAnalyzer(version.luceneVersion, Regex.compile("\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/, null), true, CharArraySet.EMPTY_SET); + } return new PatternAnalyzer(version.luceneVersion, Regex.compile("\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/, null), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET); } }, - STANDARD_HTML_STRIP { + STANDARD_HTML_STRIP(CachingStrategy.ELASTICSEARCH) { @Override protected Analyzer create(Version version) { + if (version.onOrAfter(Version.V_1_0_0_RC1)) { + return new StandardHtmlStripAnalyzer(version.luceneVersion, CharArraySet.EMPTY_SET); + } return new StandardHtmlStripAnalyzer(version.luceneVersion); } }, diff --git a/src/test/java/org/elasticsearch/index/analysis/AnalyzerBackwardsCompatTests.java b/src/test/java/org/elasticsearch/index/analysis/AnalyzerBackwardsCompatTests.java new file mode 100644 index 00000000000..3a230866384 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/AnalyzerBackwardsCompatTests.java @@ -0,0 +1,69 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.test.ElasticsearchTokenStreamTestCase; +import org.junit.Ignore; + +import java.io.IOException; + +import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_VERSION_CREATED; + +/** + */ +public class AnalyzerBackwardsCompatTests extends ElasticsearchTokenStreamTestCase { + + @Ignore + private void testNoStopwordsAfter(org.elasticsearch.Version noStopwordVersion, String type) throws IOException { + final int iters = atLeast(10); + org.elasticsearch.Version version = org.elasticsearch.Version.CURRENT; + for (int i = 0; i < iters; i++) { + ImmutableSettings.Builder builder = ImmutableSettings.settingsBuilder().put("index.analysis.filter.my_stop.type", "stop"); + if (version.onOrAfter(noStopwordVersion)) { + if (random().nextBoolean()) { + builder.put(SETTING_VERSION_CREATED, version); + } + } else { + builder.put(SETTING_VERSION_CREATED, version); + } + builder.put("index.analysis.analyzer.foo.type", type); + AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(builder.build()); + NamedAnalyzer analyzer = analysisService.analyzer("foo"); + if (version.onOrAfter(noStopwordVersion)) { + assertAnalyzesTo(analyzer, "this is bogus", new String[]{"this", "is", "bogus"}); + } else { + assertAnalyzesTo(analyzer, "this is bogus", new String[]{"bogus"}); + } + version = randomVersion(); + } + } + + public void testPatternAnalyzer() throws IOException { + testNoStopwordsAfter(org.elasticsearch.Version.V_1_0_0_RC1, "pattern"); + } + + public void testStandardHTMLStripAnalyzer() throws IOException { + testNoStopwordsAfter(org.elasticsearch.Version.V_1_0_0_RC1, "standard_html_strip"); + } + + public void testStandardAnalyzer() throws IOException { + testNoStopwordsAfter(org.elasticsearch.Version.V_1_0_0_Beta1, "standard"); + } +} diff --git a/src/test/java/org/elasticsearch/index/analysis/PreBuiltAnalyzerTests.java b/src/test/java/org/elasticsearch/index/analysis/PreBuiltAnalyzerTests.java index 3afc5335322..7719df60c84 100644 --- a/src/test/java/org/elasticsearch/index/analysis/PreBuiltAnalyzerTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/PreBuiltAnalyzerTests.java @@ -91,6 +91,41 @@ public class PreBuiltAnalyzerTests extends ElasticsearchTestCase { } } + @Test + public void testAnalyzerChangedIn10RC1() throws IOException { + Analyzer pattern = PreBuiltAnalyzers.PATTERN.getAnalyzer(Version.V_1_0_0_RC1); + Analyzer standardHtml = PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(Version.V_1_0_0_RC1); + final int n = atLeast(10); + Version version = Version.CURRENT; + for(int i = 0; i < n; i++) { + if (version.equals(Version.V_1_0_0_RC1)) { + assertThat(pattern, is(PreBuiltAnalyzers.PATTERN.getAnalyzer(version))); + assertThat(standardHtml, is(PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(version))); + } else { + assertThat(pattern, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version)))); + assertThat(standardHtml, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version)))); + } + Analyzer analyzer = randomBoolean() ? PreBuiltAnalyzers.PATTERN.getAnalyzer(version) : PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(version); + TokenStream ts = analyzer.tokenStream("foo", "This is it Dude"); + ts.reset(); + CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); + List list = new ArrayList(); + while(ts.incrementToken()) { + list.add(charTermAttribute.toString()); + } + if (version.onOrAfter(Version.V_1_0_0_RC1)) { + assertThat(list.toString(), list.size(), is(4)); + assertThat(list, contains("this", "is", "it", "dude")); + + } else { + assertThat(list.size(), is(1)); + assertThat(list, contains("dude")); + } + ts.close(); + version = randomVersion(); + } + } + @Test public void testThatInstancesAreTheSameAlwaysForKeywordAnalyzer() { assertThat(PreBuiltAnalyzers.KEYWORD.getAnalyzer(Version.CURRENT), diff --git a/src/test/java/org/elasticsearch/index/analysis/synonyms/SynonymsAnalysisTest.java b/src/test/java/org/elasticsearch/index/analysis/synonyms/SynonymsAnalysisTest.java index e35ba23dcbf..a578e39d359 100644 --- a/src/test/java/org/elasticsearch/index/analysis/synonyms/SynonymsAnalysisTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/synonyms/SynonymsAnalysisTest.java @@ -39,6 +39,7 @@ import org.elasticsearch.index.analysis.AnalysisService; import org.elasticsearch.index.settings.IndexSettingsModule; import org.elasticsearch.indices.analysis.IndicesAnalysisModule; import org.elasticsearch.indices.analysis.IndicesAnalysisService; +import org.elasticsearch.test.ElasticsearchTestCase; import org.hamcrest.MatcherAssert; import org.junit.Test; @@ -49,7 +50,7 @@ import static org.hamcrest.Matchers.equalTo; /** */ -public class SynonymsAnalysisTest { +public class SynonymsAnalysisTest extends ElasticsearchTestCase { protected final ESLogger logger = Loggers.getLogger(getClass()); private AnalysisService analysisService; diff --git a/src/test/java/org/elasticsearch/test/ElasticsearchTokenStreamTestCase.java b/src/test/java/org/elasticsearch/test/ElasticsearchTokenStreamTestCase.java index 2e102702942..d42093c4031 100644 --- a/src/test/java/org/elasticsearch/test/ElasticsearchTokenStreamTestCase.java +++ b/src/test/java/org/elasticsearch/test/ElasticsearchTokenStreamTestCase.java @@ -19,10 +19,14 @@ package org.elasticsearch.test; -import com.carrotsearch.randomizedtesting.annotations.*; +import com.carrotsearch.randomizedtesting.annotations.Listeners; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope; +import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.util.TimeUnits; +import org.elasticsearch.Version; import org.elasticsearch.test.junit.listeners.ReproduceInfoPrinter; @Listeners({ @@ -38,4 +42,7 @@ import org.elasticsearch.test.junit.listeners.ReproduceInfoPrinter; */ public abstract class ElasticsearchTokenStreamTestCase extends BaseTokenStreamTestCase { + public static Version randomVersion() { + return ElasticsearchTestCase.randomVersion(random()); + } }