From 4a15106d6a76d7e506c4966b6b5bafceca3853c9 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Wed, 14 Aug 2013 15:36:26 +0200 Subject: [PATCH] Improve backwards compatibility handling for NGram / EdgeNGram analysis The '"side" : "back"' parameter is not supported anymore on EdgeNGramTokenizer if the mapping is created with 0.90.2 / Lucene 4.3 The 'EdgeNgramTokenFilter' handles this automatically wrapping the 'EdgeNgramTokenFilter' in a 'ReverseTokenFilter' yet with tokenizers this optimization is not possible. This commit also add a more verbose error message how to work around this limitation. Closes #3489 --- .../analysis/AbstractTokenizerFactory.java | 1 + .../analysis/EdgeNGramTokenFilterFactory.java | 18 ++- .../analysis/EdgeNGramTokenizerFactory.java | 20 ++- .../index/analysis/NGramTokenizerFactory.java | 11 +- .../analysis/NGramTokenizerFactoryTests.java | 147 ++++++++++++++++++ 5 files changed, 185 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/AbstractTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/AbstractTokenizerFactory.java index e5b1853202b..c93b4272322 100644 --- a/src/main/java/org/elasticsearch/index/analysis/AbstractTokenizerFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/AbstractTokenizerFactory.java @@ -34,6 +34,7 @@ public abstract class AbstractTokenizerFactory extends AbstractIndexComponent im protected final Version version; + public AbstractTokenizerFactory(Index index, @IndexSettings Settings indexSettings, String name, Settings settings) { super(index, indexSettings); this.name = name; diff --git a/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java index cfafe214f39..263023c7437 100644 --- a/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java @@ -19,13 +19,14 @@ package org.elasticsearch.index.analysis; -import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; - import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ngram.*; +import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side; +import org.apache.lucene.analysis.ngram.Lucene43EdgeNGramTokenizer; +import org.apache.lucene.analysis.ngram.NGramTokenFilter; import org.apache.lucene.analysis.reverse.ReverseStringFilter; import org.apache.lucene.util.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; @@ -44,18 +45,25 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory { private final EdgeNGramTokenFilter.Side side; + private org.elasticsearch.Version esVersion; + @Inject public EdgeNGramTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); this.minGram = settings.getAsInt("min_gram", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE); this.maxGram = settings.getAsInt("max_gram", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE); this.side = EdgeNGramTokenFilter.Side.getSide(settings.get("side", Lucene43EdgeNGramTokenizer.DEFAULT_SIDE.getLabel())); + this.esVersion = indexSettings.getAsVersion(IndexMetaData.SETTING_VERSION_CREATED, org.elasticsearch.Version.CURRENT); } @Override public TokenStream create(TokenStream tokenStream) { - final Version version = this.version == Version.LUCENE_43 ? Version.LUCENE_44 : this.version; // we supported it since 4.3 - if (version.onOrAfter(Version.LUCENE_43)) { + if (version.onOrAfter(Version.LUCENE_43) && esVersion.onOrAfter(org.elasticsearch.Version.V_0_90_2)) { + /* + * We added this in 0.90.2 but 0.90.1 used LUCENE_43 already so we can not rely on the lucene version. + * Yet if somebody uses 0.90.2 or higher with a prev. lucene version we should also use the deprecated version. + */ + final Version version = this.version == Version.LUCENE_43 ? Version.LUCENE_44 : this.version; // always use 4.4 or higher TokenStream result = tokenStream; // side=BACK is not supported anymore but applying ReverseStringFilter up-front and after the token filter has the same effect if (side == Side.BACK) { diff --git a/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenizerFactory.java index 7386e702d7d..43d3ba86584 100644 --- a/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenizerFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenizerFactory.java @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.ngram.Lucene43EdgeNGramTokenizer; import org.apache.lucene.analysis.ngram.NGramTokenizer; import org.apache.lucene.util.Version; import org.elasticsearch.ElasticSearchIllegalArgumentException; +import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; @@ -48,6 +49,9 @@ public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory { private final Lucene43EdgeNGramTokenizer.Side side; private final CharMatcher matcher; + + protected org.elasticsearch.Version esVersion; + @Inject public EdgeNGramTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { @@ -56,17 +60,23 @@ public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory { this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE); this.side = Lucene43EdgeNGramTokenizer.Side.getSide(settings.get("side", Lucene43EdgeNGramTokenizer.DEFAULT_SIDE.getLabel())); this.matcher = parseTokenChars(settings.getAsArray("token_chars")); + this.esVersion = indexSettings.getAsVersion(IndexMetaData.SETTING_VERSION_CREATED, org.elasticsearch.Version.CURRENT); } @Override public Tokenizer create(Reader reader) { - final Version version = this.version == Version.LUCENE_43 ? Version.LUCENE_44 : this.version; // we supported it since 4.3 - if (version.onOrAfter(Version.LUCENE_44)) { + if (version.onOrAfter(Version.LUCENE_43) && esVersion.onOrAfter(org.elasticsearch.Version.V_0_90_2)) { + /* + * We added this in 0.90.2 but 0.90.1 used LUCENE_43 already so we can not rely on the lucene version. + * Yet if somebody uses 0.90.2 or higher with a prev. lucene version we should also use the deprecated version. + */ if (side == Lucene43EdgeNGramTokenizer.Side.BACK) { - throw new ElasticSearchIllegalArgumentException("side=BACK is not supported anymore. Please fix your analysis chain or use" - + " an older compatibility version (<=4.2) but beware that it might cause highlighting bugs."); + throw new ElasticSearchIllegalArgumentException("side=back is not supported anymore. Please fix your analysis chain or use" + + " an older compatibility version (<=4.2) but beware that it might cause highlighting bugs." + + " To obtain the same behavior as the previous version please use \"edgeNGram\" filter which still supports side=back" + + " in combination with a \"keyword\" tokenizer"); } - // LUCENE MONITOR: this token filter is a copy from lucene trunk and should go away once we upgrade to lucene 4.4 + final Version version = this.version == Version.LUCENE_43 ? Version.LUCENE_44 : this.version; // always use 4.4 or higher if (matcher == null) { return new EdgeNGramTokenizer(version, reader, minGram, maxGram); } else { diff --git a/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java index f5f31038bd5..b9fc4bd5bad 100644 --- a/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.ngram.Lucene43NGramTokenizer; import org.apache.lucene.analysis.ngram.NGramTokenizer; import org.apache.lucene.util.Version; import org.elasticsearch.ElasticSearchIllegalArgumentException; +import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; @@ -45,6 +46,7 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory { private final int minGram; private final int maxGram; private final CharMatcher matcher; + private org.elasticsearch.Version esVersion; static final Map MATCHERS; @@ -94,13 +96,18 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory { this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE); this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE); this.matcher = parseTokenChars(settings.getAsArray("token_chars")); + this.esVersion = indexSettings.getAsVersion(IndexMetaData.SETTING_VERSION_CREATED, org.elasticsearch.Version.CURRENT); } @SuppressWarnings("deprecation") @Override public Tokenizer create(Reader reader) { - final Version version = this.version == Version.LUCENE_43 ? Version.LUCENE_44 : this.version; // we supported it since 4.3 - if (version.onOrAfter(Version.LUCENE_44)) { + if (version.onOrAfter(Version.LUCENE_43) && esVersion.onOrAfter(org.elasticsearch.Version.V_0_90_2)) { + /* + * We added this in 0.90.2 but 0.90.1 used LUCENE_43 already so we can not rely on the lucene version. + * Yet if somebody uses 0.90.2 or higher with a prev. lucene version we should also use the deprecated version. + */ + final Version version = this.version == Version.LUCENE_43 ? Version.LUCENE_44 : this.version; // always use 4.4 or higher if (matcher == null) { return new NGramTokenizer(version, reader, minGram, maxGram); } else { diff --git a/src/test/java/org/elasticsearch/test/unit/index/analysis/NGramTokenizerFactoryTests.java b/src/test/java/org/elasticsearch/test/unit/index/analysis/NGramTokenizerFactoryTests.java index d2b2d51b20a..7e27ae422ec 100644 --- a/src/test/java/org/elasticsearch/test/unit/index/analysis/NGramTokenizerFactoryTests.java +++ b/src/test/java/org/elasticsearch/test/unit/index/analysis/NGramTokenizerFactoryTests.java @@ -22,17 +22,33 @@ package org.elasticsearch.test.unit.index.analysis; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ngram.*; +import org.apache.lucene.analysis.reverse.ReverseStringFilter; import org.elasticsearch.ElasticSearchIllegalArgumentException; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.ImmutableSettings.Builder; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.Index; +import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory; import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory; import org.elasticsearch.index.analysis.NGramTokenizerFactory; import org.junit.Test; import java.io.IOException; import java.io.StringReader; +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; +import java.util.Random; + +import static org.hamcrest.Matchers.instanceOf; @ThreadLeakScope(Scope.NONE) public class NGramTokenizerFactoryTests extends BaseTokenStreamTestCase { @@ -85,5 +101,136 @@ public class NGramTokenizerFactoryTests extends BaseTokenStreamTestCase { assertTokenStreamContents(new EdgeNGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader(" a!$ 9")), new String[] {" a", " a!"}); } + + @Test + public void testBackwardsCompatibilityEdgeNgramTokenizer() throws IllegalArgumentException, IllegalAccessException { + int iters = atLeast(20); + final Index index = new Index("test"); + final String name = "ngr"; + for (int i = 0; i < iters; i++) { + Version v = randomVersion(random()); + if (v.onOrAfter(Version.V_0_90_2)) { + Builder builder = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit"); + boolean compatVersion = false; + if ((compatVersion = random().nextBoolean())) { + builder.put("version", "4." + random().nextInt(3)); + builder.put("side", "back"); + } + Settings settings = builder.build(); + Settings indexSettings = ImmutableSettings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, v.id).build(); + Tokenizer edgeNGramTokenizer = new EdgeNGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader( + "foo bar")); + if (compatVersion) { + assertThat(edgeNGramTokenizer, instanceOf(Lucene43EdgeNGramTokenizer.class)); + } else { + assertThat(edgeNGramTokenizer, instanceOf(EdgeNGramTokenizer.class)); + } + + } else { + Settings settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("side", "back").build(); + Settings indexSettings = ImmutableSettings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, v.id).build(); + Tokenizer edgeNGramTokenizer = new EdgeNGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader( + "foo bar")); + assertThat(edgeNGramTokenizer, instanceOf(Lucene43EdgeNGramTokenizer.class)); + } + } + Settings settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("side", "back").build(); + Settings indexSettings = ImmutableSettings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); + try { + new EdgeNGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader("foo bar")); + fail("should fail side:back is not supported anymore"); + } catch (ElasticSearchIllegalArgumentException ex) { + } + + } + + @Test + public void testBackwardsCompatibilityNgramTokenizer() throws IllegalArgumentException, IllegalAccessException { + int iters = atLeast(20); + for (int i = 0; i < iters; i++) { + final Index index = new Index("test"); + final String name = "ngr"; + Version v = randomVersion(random()); + if (v.onOrAfter(Version.V_0_90_2)) { + Builder builder = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit"); + boolean compatVersion = false; + if ((compatVersion = random().nextBoolean())) { + builder.put("version", "4." + random().nextInt(3)); + } + Settings settings = builder.build(); + Settings indexSettings = ImmutableSettings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, v.id).build(); + Tokenizer nGramTokenizer = new NGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader( + "foo bar")); + if (compatVersion) { + assertThat(nGramTokenizer, instanceOf(Lucene43NGramTokenizer.class)); + } else { + assertThat(nGramTokenizer, instanceOf(NGramTokenizer.class)); + } + + } else { + Settings settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).build(); + Settings indexSettings = ImmutableSettings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, v.id).build(); + Tokenizer nGramTokenizer = new NGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader( + "foo bar")); + assertThat(nGramTokenizer, instanceOf(Lucene43NGramTokenizer.class)); + } + } + } + + @Test + public void testBackwardsCompatibilityEdgeNgramTokenFilter() throws IllegalArgumentException, IllegalAccessException { + int iters = atLeast(20); + for (int i = 0; i < iters; i++) { + final Index index = new Index("test"); + final String name = "ngr"; + Version v = randomVersion(random()); + if (v.onOrAfter(Version.V_0_90_2)) { + Builder builder = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3); + boolean compatVersion = false; + if ((compatVersion = random().nextBoolean())) { + builder.put("version", "4." + random().nextInt(3)); + } + boolean reverse = random().nextBoolean(); + if (reverse) { + builder.put("side", "back"); + } + Settings settings = builder.build(); + Settings indexSettings = ImmutableSettings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, v.id).build(); + TokenStream edgeNGramTokenFilter = new EdgeNGramTokenFilterFactory(index, indexSettings, name, settings).create(new MockTokenizer(new StringReader( + "foo bar"))); + if (compatVersion) { + assertThat(edgeNGramTokenFilter, instanceOf(EdgeNGramTokenFilter.class)); + } else if (reverse && !compatVersion){ + assertThat(edgeNGramTokenFilter, instanceOf(ReverseStringFilter.class)); + } else { + assertThat(edgeNGramTokenFilter, instanceOf(EdgeNGramTokenFilter.class)); + } + + } else { + Builder builder = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3); + boolean reverse = random().nextBoolean(); + if (reverse) { + builder.put("side", "back"); + } + Settings settings = builder.build(); + Settings indexSettings = ImmutableSettings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, v.id).build(); + TokenStream edgeNGramTokenFilter = new EdgeNGramTokenFilterFactory(index, indexSettings, name, settings).create(new MockTokenizer(new StringReader( + "foo bar"))); + assertThat(edgeNGramTokenFilter, instanceOf(EdgeNGramTokenFilter.class)); + } + } + } + + + private Version randomVersion(Random random) throws IllegalArgumentException, IllegalAccessException { + Field[] declaredFields = Version.class.getDeclaredFields(); + List versionFields = new ArrayList(); + for (Field field : declaredFields) { + if ((field.getModifiers() & Modifier.STATIC) != 0 && field.getName().startsWith("V_") && field.getType() == Version.class) { + versionFields.add(field); + } + } + return (Version) versionFields.get(random.nextInt(versionFields.size())).get(Version.class); + } }