From 4480762bf85b111d58fb1737bd32117ceb1cab15 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Mon, 5 Dec 2011 13:31:59 +0200 Subject: [PATCH 001/131] first commit --- .gitignore | 7 + README.md | 15 ++ pom.xml | 128 ++++++++++++++++++ src/main/assemblies/plugin.xml | 26 ++++ .../analysis/IcuAnalysisBinderProcessor.java | 38 ++++++ .../IcuCollationTokenFilterFactory.java | 106 +++++++++++++++ .../IcuFoldingTokenFilterFactory.java | 45 ++++++ .../IcuNormalizerTokenFilterFactory.java | 52 +++++++ .../analysis/icu/AnalysisICUPlugin.java | 49 +++++++ src/main/resources/es-plugin.properties | 1 + .../analysis/SimpleIcuAnalysisTests.java | 59 ++++++++ src/test/resources/log4j.properties | 5 + 12 files changed, 531 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 pom.xml create mode 100644 src/main/assemblies/plugin.xml create mode 100644 src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java create mode 100644 src/main/resources/es-plugin.properties create mode 100644 src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java create mode 100644 src/test/resources/log4j.properties diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000000..06a1e6fedb6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +/data +/work +/logs +/.idea +/target +.DS_Store +*.iml diff --git a/README.md b/README.md new file mode 100644 index 00000000000..50bb848c982 --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +ICU Analysis for ElasticSearch +================================== + +The ICU Analysis plugin 
integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. + +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.0.0`. + + --------------------------------------- + | memcached Plugin | ElasticSearch | + --------------------------------------- + | master | 0.18 -> master | + --------------------------------------- + | 1.0.0 | 0.18 -> master | + --------------------------------------- + diff --git a/pom.xml b/pom.xml new file mode 100644 index 00000000000..da5bdfd0d97 --- /dev/null +++ b/pom.xml @@ -0,0 +1,128 @@ + + + elasticsearch-analysis-icu + 4.0.0 + org.elasticsearch + elasticsearch-analysis-icu + 1.0.0 + jar + ICU Analysis for ElasticSearch + 2009 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + scm:git:git@github.com:elasticsearch/elasticsearch-analysis-icu.git + scm:git:git@github.com:elasticsearch/elasticsearch-analysis-icu.git + + http://github.com/elasticsearch/elasticsearch-analysis-icu + + + + org.sonatype.oss + oss-parent + 7 + + + + 0.18.5 + + + + + + + + org.elasticsearch + elasticsearch + ${elasticsearch.version} + compile + + + + org.apache.lucene + lucene-icu + 3.5.0 + compile + + + + log4j + log4j + 1.2.16 + runtime + + + + org.testng + testng + 6.3.1 + test + + + + org.hamcrest + hamcrest-core + 1.3.RC2 + test + + + + org.hamcrest + hamcrest-library + 1.3.RC2 + test + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.3.2 + + 1.6 + 1.6 + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.11 + + + **/*Tests.java + + + + + org.apache.maven.plugins + maven-source-plugin + 2.1.2 + + + attach-sources + + jar + + + + + + maven-assembly-plugin + + + ${basedir}/src/main/assemblies/plugin.xml + + + + + + \ No newline at end of file diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml new file mode 100644 index 00000000000..c5c3a71957b --- /dev/null +++ 
b/src/main/assemblies/plugin.xml @@ -0,0 +1,26 @@ + + + + + zip + + false + + + / + true + true + + org.elasticsearch:elasticsearch + + + + / + true + true + + org.apache.lucene:lucene-icu + + + + \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java new file mode 100644 index 00000000000..111607cbea6 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java @@ -0,0 +1,38 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +/** + * + */ +public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { + + @Override + public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { + tokenFiltersBindings.processTokenFilter("icuNormalizer", IcuNormalizerTokenFilterFactory.class); + tokenFiltersBindings.processTokenFilter("icu_normalizer", IcuNormalizerTokenFilterFactory.class); + + tokenFiltersBindings.processTokenFilter("icuFolding", IcuFoldingTokenFilterFactory.class); + tokenFiltersBindings.processTokenFilter("icu_folding", IcuFoldingTokenFilterFactory.class); + + tokenFiltersBindings.processTokenFilter("icuCollation", IcuCollationTokenFilterFactory.class); + tokenFiltersBindings.processTokenFilter("icu_collation", IcuCollationTokenFilterFactory.class); + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java new file mode 100644 index 00000000000..3517c01b695 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java @@ -0,0 +1,106 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RuleBasedCollator; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.collation.ICUCollationKeyFilter; +import org.elasticsearch.ElasticSearchIllegalArgumentException; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.FailedToResolveConfigException; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +import java.io.IOException; +import java.util.Locale; + +/** + * An ICU based collation token filter. There are two ways to configure collation: + *

+ *

The first is simply specifying the locale (defaults to the default locale). The language + * parameter is the lowercase two-letter ISO-639 code. An additional country and variant + * can be provided. + *

+ *

The second option is to specify collation rules as defined in the + * Collation customization chapter in icu docs. The rules parameter can either embed the rules definition + * in the settings or refer to an external location (preferable located under the config location, relative to it). + * + * + */ +public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { + + private final Collator collator; + + @Inject + public IcuCollationTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment environment, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + + Collator collator; + String rules = settings.get("rules"); + if (rules != null) { + FailedToResolveConfigException failureToResolve = null; + try { + rules = environment.resolveConfigAndLoadToString(rules); + } catch (FailedToResolveConfigException e) { + failureToResolve = e; + } catch (IOException e) { + throw new ElasticSearchIllegalArgumentException("Failed to load collation rules", e); + } + try { + collator = new RuleBasedCollator(rules); + } catch (Exception e) { + if (failureToResolve != null) { + throw new ElasticSearchIllegalArgumentException("Failed to resolve collation rules location", failureToResolve); + } else { + throw new ElasticSearchIllegalArgumentException("Failed to parse collation rules", e); + } + } + } else { + String language = settings.get("language"); + if (language != null) { + Locale locale; + String country = settings.get("country"); + if (country != null) { + String variant = settings.get("variant"); + if (variant != null) { + locale = new Locale(language, country, variant); + } else { + locale = new Locale(language, country); + } + } else { + locale = new Locale(language); + } + collator = Collator.getInstance(locale); + } else { + collator = Collator.getInstance(); + } + } + this.collator = collator; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new 
ICUCollationKeyFilter(tokenStream, collator); + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java new file mode 100644 index 00000000000..ed11a227dcf --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java @@ -0,0 +1,45 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.icu.ICUFoldingFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + + +/** + * + */ +public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory { + + @Inject + public IcuFoldingTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ICUFoldingFilter(tokenStream); + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java new file mode 100644 index 00000000000..b28e7c92122 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java @@ -0,0 +1,52 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import com.ibm.icu.text.Normalizer2; +import org.apache.lucene.analysis.TokenStream; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + + +/** + * Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to normalize tokens. + *

+ *

The name can be used to provide the type of normalization to perform. + * + * + */ +public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory { + + private final String name; + + @Inject + public IcuNormalizerTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + this.name = settings.get("name", "nfkc_cf"); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE)); + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java b/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java new file mode 100644 index 00000000000..f24852db856 --- /dev/null +++ b/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java @@ -0,0 +1,49 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.plugin.analysis.icu; + +import org.elasticsearch.common.inject.Module; +import org.elasticsearch.index.analysis.AnalysisModule; +import org.elasticsearch.index.analysis.IcuAnalysisBinderProcessor; +import org.elasticsearch.plugins.AbstractPlugin; + +/** + * + */ +public class AnalysisICUPlugin extends AbstractPlugin { + + @Override + public String name() { + return "analysis-icu"; + } + + @Override + public String description() { + return "UTF related ICU analysis support"; + } + + @Override + public void processModule(Module module) { + if (module instanceof AnalysisModule) { + AnalysisModule analysisModule = (AnalysisModule) module; + analysisModule.addProcessor(new IcuAnalysisBinderProcessor()); + } + } +} diff --git a/src/main/resources/es-plugin.properties b/src/main/resources/es-plugin.properties new file mode 100644 index 00000000000..b694c79bec0 --- /dev/null +++ b/src/main/resources/es-plugin.properties @@ -0,0 +1 @@ +plugin=org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java new file mode 100644 index 00000000000..adcd03a645b --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java @@ -0,0 +1,59 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.elasticsearch.common.inject.Injector; +import org.elasticsearch.common.inject.ModulesBuilder; +import org.elasticsearch.common.settings.SettingsModule; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.EnvironmentModule; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexNameModule; +import org.elasticsearch.index.settings.IndexSettingsModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisService; +import org.hamcrest.MatcherAssert; +import org.testng.annotations.Test; + +import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS; +import static org.hamcrest.Matchers.instanceOf; + +/** + * + */ +public class SimpleIcuAnalysisTests { + + @Test + public void testDefaultsIcuAnalysis() { + Index index = new Index("test"); + + Injector parentInjector = new ModulesBuilder().add(new SettingsModule(EMPTY_SETTINGS), new EnvironmentModule(new Environment(EMPTY_SETTINGS)), new IndicesAnalysisModule()).createInjector(); + Injector injector = new ModulesBuilder().add( + new IndexSettingsModule(index, EMPTY_SETTINGS), + new IndexNameModule(index), + new AnalysisModule(EMPTY_SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor())) + .createChildInjector(parentInjector); + + AnalysisService analysisService = injector.getInstance(AnalysisService.class); + + TokenFilterFactory filterFactory = 
analysisService.tokenFilter("icu_normalizer"); + MatcherAssert.assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class)); + } +} diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties new file mode 100644 index 00000000000..497c97f9959 --- /dev/null +++ b/src/test/resources/log4j.properties @@ -0,0 +1,5 @@ +log4j.rootLogger=INFO, out + +log4j.appender.out=org.apache.log4j.ConsoleAppender +log4j.appender.out.layout=org.apache.log4j.PatternLayout +log4j.appender.out.layout.conversionPattern=[%d{ISO8601}][%-5p][%-25c] %m%n From 271729e31588f3a64319b50bd36d56829285e47d Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Mon, 5 Dec 2011 13:35:54 +0200 Subject: [PATCH 002/131] update to 1.1.0 snap --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index da5bdfd0d97..1958c4fdbe2 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.0.0 + 1.1.0-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 From 2bf12bbd90862cc14b6871264ad1b0976af81e6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Prante?= Date: Sun, 11 Dec 2011 00:40:31 +0100 Subject: [PATCH 003/131] Update README.md --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 50bb848c982..ce43c102d6f 100644 --- a/README.md +++ b/README.md @@ -5,11 +5,11 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.0.0`. 
- --------------------------------------- - | memcached Plugin | ElasticSearch | - --------------------------------------- - | master | 0.18 -> master | - --------------------------------------- - | 1.0.0 | 0.18 -> master | - --------------------------------------- + ---------------------------------------- + | ICU Analysis Plugin | ElasticSearch | + ---------------------------------------- + | master | 0.18 -> master | + ---------------------------------------- + | 1.0.0 | 0.18 -> master | + ---------------------------------------- From 3f9c893ec61725dd5cab788080f14331e2643523 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Prante?= Date: Sun, 11 Dec 2011 02:20:42 +0100 Subject: [PATCH 004/131] added ICU tokenizer ; ICU transform by transliterator --- .../index/analysis/IcuTokenizerFactory.java | 45 ++++++++++++++++ .../IcuTransformTokenFilterFactory.java | 52 +++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java new file mode 100644 index 00000000000..736ba204046 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java @@ -0,0 +1,45 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import java.io.Reader; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +/** + * + * @author joerg + */ +public class IcuTokenizerFactory extends AbstractTokenizerFactory { + + @Inject public IcuTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + } + + @Override + public Tokenizer create(Reader reader) { + return new ICUTokenizer(reader); + } + +} diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java new file mode 100644 index 00000000000..1f34d02b3bd --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java @@ -0,0 +1,52 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.icu.ICUTransformFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import com.ibm.icu.text.Transliterator; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + + +/** + * @author joergprante + */ +public class IcuTransformTokenFilterFactory extends AbstractTokenFilterFactory { + + private final String id; + private final int dir; + private final Transliterator transliterator; + + @Inject public IcuTransformTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + this.id = settings.get("id", "Null"); + String s = settings.get("dir", "forward"); + this.dir = "forward".equals(s) ? 
Transliterator.FORWARD : Transliterator.REVERSE; + this.transliterator = Transliterator.getInstance(id, dir); + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new ICUTransformFilter(tokenStream, transliterator); + } +} \ No newline at end of file From 188944be77b63d3e7016182668998c5a120a5b25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Prante?= Date: Sun, 11 Dec 2011 02:24:03 +0100 Subject: [PATCH 005/131] added ICU tokenizer and ICU transform --- .../analysis/IcuAnalysisBinderProcessor.java | 17 +++++++++---- .../analysis/SimpleIcuAnalysisTests.java | 25 +++++++++++++------ 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java index 111607cbea6..dd4413a580f 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java @@ -1,8 +1,8 @@ /* - * Licensed to ElasticSearch and Shay Banon under one + * Licensed to Elastic Search and Shay Banon under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. ElasticSearch licenses this + * regarding copyright ownership. Elastic Search licenses this * file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at @@ -20,12 +20,16 @@ package org.elasticsearch.index.analysis; /** - * + * @author kimchy (shay.banon) */ public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { - @Override - public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { + @Override public void processTokenizers(TokenizersBindings tokenizersBindings) { + tokenizersBindings.processTokenizer("icuTokenizer", IcuTokenizerFactory.class); + tokenizersBindings.processTokenizer("icu_tokenizer", IcuTokenizerFactory.class); + } + + @Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { tokenFiltersBindings.processTokenFilter("icuNormalizer", IcuNormalizerTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("icu_normalizer", IcuNormalizerTokenFilterFactory.class); @@ -34,5 +38,8 @@ public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderPro tokenFiltersBindings.processTokenFilter("icuCollation", IcuCollationTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("icu_collation", IcuCollationTokenFilterFactory.class); + + tokenFiltersBindings.processTokenFilter("icuTransform", IcuTransformTokenFilterFactory.class); + tokenFiltersBindings.processTokenFilter("icu_transform", IcuTransformTokenFilterFactory.class); } } diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java index adcd03a645b..f28f13184dd 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java @@ -1,8 +1,8 @@ /* - * Licensed to ElasticSearch and Shay Banon under one + * Licensed to Elastic Search and Shay Banon under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. ElasticSearch licenses this + * regarding copyright ownership. Elastic Search licenses this * file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at @@ -32,16 +32,15 @@ import org.elasticsearch.indices.analysis.IndicesAnalysisService; import org.hamcrest.MatcherAssert; import org.testng.annotations.Test; -import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS; -import static org.hamcrest.Matchers.instanceOf; +import static org.elasticsearch.common.settings.ImmutableSettings.Builder.*; +import static org.hamcrest.Matchers.*; /** - * + * @author kimchy (shay.banon) */ public class SimpleIcuAnalysisTests { - @Test - public void testDefaultsIcuAnalysis() { + @Test public void testDefaultsIcuAnalysis() { Index index = new Index("test"); Injector parentInjector = new ModulesBuilder().add(new SettingsModule(EMPTY_SETTINGS), new EnvironmentModule(new Environment(EMPTY_SETTINGS)), new IndicesAnalysisModule()).createInjector(); @@ -53,7 +52,19 @@ public class SimpleIcuAnalysisTests { AnalysisService analysisService = injector.getInstance(AnalysisService.class); + TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer"); + MatcherAssert.assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class)); + TokenFilterFactory filterFactory = analysisService.tokenFilter("icu_normalizer"); MatcherAssert.assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class)); + + filterFactory = analysisService.tokenFilter("icu_folding"); + MatcherAssert.assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class)); + + filterFactory = analysisService.tokenFilter("icu_collation"); + MatcherAssert.assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class)); + + 
filterFactory = analysisService.tokenFilter("icu_transform"); + MatcherAssert.assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class)); } } From 1901efb64ae6605590cf0ecadd4ee7eb69d6efa5 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Sun, 11 Dec 2011 18:59:12 +0200 Subject: [PATCH 006/131] remove author names (we no longer have them on) and fix header --- .../analysis/IcuAnalysisBinderProcessor.java | 13 ++++++------ .../index/analysis/IcuTokenizerFactory.java | 20 +++++++++---------- .../IcuTransformTokenFilterFactory.java | 14 +++++++------ .../analysis/SimpleIcuAnalysisTests.java | 18 ++++++++--------- 4 files changed, 34 insertions(+), 31 deletions(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java index dd4413a580f..c1d52232fe8 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java @@ -1,8 +1,8 @@ /* - * Licensed to Elastic Search and Shay Banon under one + * Licensed to ElasticSearch and Shay Banon under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. Elastic Search licenses this + * regarding copyright ownership. ElasticSearch licenses this * file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at @@ -20,16 +20,17 @@ package org.elasticsearch.index.analysis; /** - * @author kimchy (shay.banon) */ public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { - @Override public void processTokenizers(TokenizersBindings tokenizersBindings) { + @Override + public void processTokenizers(TokenizersBindings tokenizersBindings) { tokenizersBindings.processTokenizer("icuTokenizer", IcuTokenizerFactory.class); tokenizersBindings.processTokenizer("icu_tokenizer", IcuTokenizerFactory.class); } - - @Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { + + @Override + public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { tokenFiltersBindings.processTokenFilter("icuNormalizer", IcuNormalizerTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("icu_normalizer", IcuNormalizerTokenFilterFactory.class); diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java index 736ba204046..440fbb4084e 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java @@ -1,8 +1,8 @@ /* - * Licensed to Elastic Search and Shay Banon under one + * Licensed to ElasticSearch and Shay Banon under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. Elastic Search licenses this + * regarding copyright ownership. ElasticSearch licenses this * file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at @@ -18,28 +18,28 @@ */ package org.elasticsearch.index.analysis; -import org.elasticsearch.common.inject.Inject; -import org.elasticsearch.common.inject.assistedinject.Assisted; -import java.io.Reader; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.Index; import org.elasticsearch.index.settings.IndexSettings; +import java.io.Reader; + /** - * - * @author joerg */ public class IcuTokenizerFactory extends AbstractTokenizerFactory { - @Inject public IcuTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + @Inject + public IcuTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); } - + @Override public Tokenizer create(Reader reader) { return new ICUTokenizer(reader); } - + } diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java index 1f34d02b3bd..525fedac508 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java @@ -1,8 +1,8 @@ /* - * Licensed to Elastic Search and Shay Banon under one + * Licensed to ElasticSearch and Shay Banon under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. Elastic Search licenses this + * regarding copyright ownership. 
ElasticSearch licenses this * file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at @@ -19,11 +19,11 @@ package org.elasticsearch.index.analysis; +import com.ibm.icu.text.Transliterator; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.icu.ICUTransformFilter; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; -import com.ibm.icu.text.Transliterator; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.Index; import org.elasticsearch.index.settings.IndexSettings; @@ -38,15 +38,17 @@ public class IcuTransformTokenFilterFactory extends AbstractTokenFilterFactory { private final int dir; private final Transliterator transliterator; - @Inject public IcuTransformTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + @Inject + public IcuTransformTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); this.id = settings.get("id", "Null"); - String s = settings.get("dir", "forward"); + String s = settings.get("dir", "forward"); this.dir = "forward".equals(s) ? 
Transliterator.FORWARD : Transliterator.REVERSE; this.transliterator = Transliterator.getInstance(id, dir); } - @Override public TokenStream create(TokenStream tokenStream) { + @Override + public TokenStream create(TokenStream tokenStream) { return new ICUTransformFilter(tokenStream, transliterator); } } \ No newline at end of file diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java index f28f13184dd..a3e55a7a10e 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java @@ -1,8 +1,8 @@ /* - * Licensed to Elastic Search and Shay Banon under one + * Licensed to ElasticSearch and Shay Banon under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. Elastic Search licenses this + * regarding copyright ownership. ElasticSearch licenses this * file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at @@ -32,15 +32,15 @@ import org.elasticsearch.indices.analysis.IndicesAnalysisService; import org.hamcrest.MatcherAssert; import org.testng.annotations.Test; -import static org.elasticsearch.common.settings.ImmutableSettings.Builder.*; -import static org.hamcrest.Matchers.*; +import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS; +import static org.hamcrest.Matchers.instanceOf; /** - * @author kimchy (shay.banon) */ public class SimpleIcuAnalysisTests { - @Test public void testDefaultsIcuAnalysis() { + @Test + public void testDefaultsIcuAnalysis() { Index index = new Index("test"); Injector parentInjector = new ModulesBuilder().add(new SettingsModule(EMPTY_SETTINGS), new EnvironmentModule(new Environment(EMPTY_SETTINGS)), new IndicesAnalysisModule()).createInjector(); @@ -54,16 +54,16 @@ public class SimpleIcuAnalysisTests { TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer"); MatcherAssert.assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class)); - + TokenFilterFactory filterFactory = analysisService.tokenFilter("icu_normalizer"); MatcherAssert.assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class)); - + filterFactory = analysisService.tokenFilter("icu_folding"); MatcherAssert.assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class)); filterFactory = analysisService.tokenFilter("icu_collation"); MatcherAssert.assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class)); - + filterFactory = analysisService.tokenFilter("icu_transform"); MatcherAssert.assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class)); } From 7cb31acb4da429148e6b82eb070172812ece775f Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Tue, 13 Dec 2011 15:03:24 +0200 Subject: [PATCH 007/131] fix pom --- pom.xml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pom.xml b/pom.xml index 1958c4fdbe2..61067ae46e2 
100644 --- a/pom.xml +++ b/pom.xml @@ -118,10 +118,19 @@ maven-assembly-plugin + ${project.build.directory}/releases/ ${basedir}/src/main/assemblies/plugin.xml + + + package + + single + + + From b61e27eccc73f389602aff4648c8c9a8406fc0c6 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Tue, 13 Dec 2011 15:27:47 +0200 Subject: [PATCH 008/131] release 1.1.0 --- README.md | 100 +++++++++++++++++- pom.xml | 2 +- .../IcuTransformTokenFilterFactory.java | 1 - 3 files changed, 100 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ce43c102d6f..993a644485b 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,111 @@ ICU Analysis for ElasticSearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.0.0`. +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.1.0`. ---------------------------------------- | ICU Analysis Plugin | ElasticSearch | ---------------------------------------- | master | 0.18 -> master | ---------------------------------------- + | 1.1.0 | 0.18 -> master | + ---------------------------------------- | 1.0.0 | 0.18 -> master | ---------------------------------------- + +ICU Normalization +----------------- + +Normalizes characters as explained "here":http://userguide.icu-project.org/transforms/normalization. It registers itself by default under @icu_normalizer@ or @icuNormalizer@ using the default settings. Allows for the name parameter to be provided which can include the following values: @nfc@, @nfkc@, and @nfkc_cf@. Here is a sample settings: + + { + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "keyword", + "filter" : ["icu_normalizer"] + } + } + } + } + } + +ICU Folding +----------- + +Folding of unicode characters based on @UTR#30@. 
It registers itself under @icu_folding@ and @icuFolding@ names. Sample setting: + + { + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "keyword", + "filter" : ["icu_folding"] + } + } + } + } + } + +ICU Collation +------------- + +Uses collation token filter. Allows to either specify the rules for collation (defined "here":http://www.icu-project.org/userguide/Collate_Customization.html) using the @rules@ parameter (can point to a location or expressed in the settings, location can be relative to config location), or using the @language@ parameter (further specialized by country and variant). By default registers under @icu_collation@ or @icuCollation@ and uses the default locale. + +Here is a sample settings: + + { + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "keyword", + "filter" : ["icu_collation"] + } + } + } + } + } + +And here is a sample of custom collation: + + { + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "keyword", + "filter" : ["myCollator"] + } + }, + "filter" : { + "myCollator" : { + "type" : "icu_collation", + "language" : "en" + } + } + } + } + } + + +ICU Tokenizer +------------- + +Breaks text into words according to UAX #29: Unicode Text Segmentation ((http://www.unicode.org/reports/tr29/)). 
+ + { + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "icu_tokenizer", + } + } + } + } + } + diff --git a/pom.xml b/pom.xml index 61067ae46e2..e0f58cf65aa 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.1.0-SNAPSHOT + 1.1.0 jar ICU Analysis for ElasticSearch 2009 diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java index 525fedac508..17f727c446a 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java @@ -30,7 +30,6 @@ import org.elasticsearch.index.settings.IndexSettings; /** - * @author joergprante */ public class IcuTransformTokenFilterFactory extends AbstractTokenFilterFactory { From 563bba5ed84bda833a3082bad318af95b53cbb49 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Tue, 13 Dec 2011 15:29:35 +0200 Subject: [PATCH 009/131] move to 1.2.0 snap --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index e0f58cf65aa..bbb385d4079 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.1.0 + 1.2.0-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 From d750fa1a2f6c5bdd25ba486ce6a3a589cefb3d2e Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Thu, 29 Dec 2011 11:43:55 +0200 Subject: [PATCH 010/131] no need to have both camel case and underscore casing, we handle camelcase from underscore automatically. 
--- .../index/analysis/IcuAnalysisBinderProcessor.java | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java index c1d52232fe8..c032c45ec2c 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java @@ -25,22 +25,14 @@ public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderPro @Override public void processTokenizers(TokenizersBindings tokenizersBindings) { - tokenizersBindings.processTokenizer("icuTokenizer", IcuTokenizerFactory.class); tokenizersBindings.processTokenizer("icu_tokenizer", IcuTokenizerFactory.class); } @Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { - tokenFiltersBindings.processTokenFilter("icuNormalizer", IcuNormalizerTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("icu_normalizer", IcuNormalizerTokenFilterFactory.class); - - tokenFiltersBindings.processTokenFilter("icuFolding", IcuFoldingTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("icu_folding", IcuFoldingTokenFilterFactory.class); - - tokenFiltersBindings.processTokenFilter("icuCollation", IcuCollationTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("icu_collation", IcuCollationTokenFilterFactory.class); - - tokenFiltersBindings.processTokenFilter("icuTransform", IcuTransformTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("icu_transform", IcuTransformTokenFilterFactory.class); } } From 1f89efc44254314016ecb8cc0a8e7362fd31f006 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Wed, 18 Jan 2012 22:33:44 +0200 Subject: [PATCH 011/131] register global analyzers with ICU --- .../indices/analysis/IcuIndicesAnalysis.java | 93 +++++++++++++++++++ .../analysis/IcuIndicesAnalysisModule.java | 32 
+++++++ .../analysis/icu/AnalysisICUPlugin.java | 9 ++ 3 files changed, 134 insertions(+) create mode 100644 src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java create mode 100644 src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysisModule.java diff --git a/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java b/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java new file mode 100644 index 00000000000..21ed87f854f --- /dev/null +++ b/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java @@ -0,0 +1,93 @@ +package org.elasticsearch.indices.analysis; + +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.Transliterator; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.icu.ICUFoldingFilter; +import org.apache.lucene.analysis.icu.ICUTransformFilter; +import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; +import org.apache.lucene.collation.ICUCollationKeyFilter; +import org.elasticsearch.common.component.AbstractComponent; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory; +import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.index.analysis.TokenizerFactory; + +import java.io.Reader; + +/** + * Registers indices level analysis components so, if not explicitly configured, will be shared + * among all indices. 
+ */ +public class IcuIndicesAnalysis extends AbstractComponent { + + @Inject + public IcuIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) { + super(settings); + + indicesAnalysisService.tokenizerFactories().put("icu_tokenizer", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override + public String name() { + return "icu_tokenizer"; + } + + @Override + public Tokenizer create(Reader reader) { + return new ICUTokenizer(reader); + } + })); + + indicesAnalysisService.tokenFilterFactories().put("icu_normalizer", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override + public String name() { + return "icu_normalizer"; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)); + } + })); + + + indicesAnalysisService.tokenFilterFactories().put("icu_folding", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override + public String name() { + return "icu_folding"; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ICUFoldingFilter(tokenStream); + } + })); + + indicesAnalysisService.tokenFilterFactories().put("icu_collation", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override + public String name() { + return "icu_collation"; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ICUCollationKeyFilter(tokenStream, Collator.getInstance()); + } + })); + + indicesAnalysisService.tokenFilterFactories().put("icu_transform", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override + public String name() { + return "icu_transform"; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ICUTransformFilter(tokenStream, Transliterator.getInstance("Null", Transliterator.FORWARD)); + } + })); + } 
+} diff --git a/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysisModule.java b/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysisModule.java new file mode 100644 index 00000000000..5547df665a0 --- /dev/null +++ b/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysisModule.java @@ -0,0 +1,32 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.indices.analysis; + +import org.elasticsearch.common.inject.AbstractModule; + +/** + */ +public class IcuIndicesAnalysisModule extends AbstractModule { + + @Override + protected void configure() { + bind(IcuIndicesAnalysis.class).asEagerSingleton(); + } +} diff --git a/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java b/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java index f24852db856..c4d83661698 100644 --- a/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java +++ b/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java @@ -19,11 +19,15 @@ package org.elasticsearch.plugin.analysis.icu; +import org.elasticsearch.common.collect.ImmutableList; import org.elasticsearch.common.inject.Module; import org.elasticsearch.index.analysis.AnalysisModule; import org.elasticsearch.index.analysis.IcuAnalysisBinderProcessor; +import org.elasticsearch.indices.analysis.IcuIndicesAnalysisModule; import org.elasticsearch.plugins.AbstractPlugin; +import java.util.Collection; + /** * */ @@ -39,6 +43,11 @@ public class AnalysisICUPlugin extends AbstractPlugin { return "UTF related ICU analysis support"; } + @Override + public Collection> modules() { + return ImmutableList.>of(IcuIndicesAnalysisModule.class); + } + @Override public void processModule(Module module) { if (module instanceof AnalysisModule) { From 90e80cf83c670b51ac3c37512a8a802abc85474b Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Tue, 31 Jan 2012 12:55:10 +0200 Subject: [PATCH 012/131] move to 0.19 snap and use some of its features --- README.md | 2 +- pom.xml | 2 +- .../plugin/analysis/icu/AnalysisICUPlugin.java | 11 +++++------ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 993a644485b..403c9a5257a 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ In order to install the plugin, simply run: `bin/plugin -install elasticsearch/e 
---------------------------------------- | ICU Analysis Plugin | ElasticSearch | ---------------------------------------- - | master | 0.18 -> master | + | master | master (0.19) | ---------------------------------------- | 1.1.0 | 0.18 -> master | ---------------------------------------- diff --git a/pom.xml b/pom.xml index bbb385d4079..33833a97731 100644 --- a/pom.xml +++ b/pom.xml @@ -31,7 +31,7 @@ - 0.18.5 + 0.19.0-SNAPSHOT diff --git a/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java b/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java index c4d83661698..975bcf14b5d 100644 --- a/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java +++ b/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java @@ -48,11 +48,10 @@ public class AnalysisICUPlugin extends AbstractPlugin { return ImmutableList.>of(IcuIndicesAnalysisModule.class); } - @Override - public void processModule(Module module) { - if (module instanceof AnalysisModule) { - AnalysisModule analysisModule = (AnalysisModule) module; - analysisModule.addProcessor(new IcuAnalysisBinderProcessor()); - } + /** + * Automatically called with the analysis module. + */ + public void onModule(AnalysisModule module) { + module.addProcessor(new IcuAnalysisBinderProcessor()); } } From 76ebae7daa2511971c3461f773f46326d1107500 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Tue, 7 Feb 2012 15:23:36 +0200 Subject: [PATCH 013/131] release 1.2.0 which works with 0.19 --- README.md | 6 ++++-- pom.xml | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 403c9a5257a..a92df240ef4 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,14 @@ ICU Analysis for ElasticSearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.1.0`. 
+In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.2.0`. ---------------------------------------- | ICU Analysis Plugin | ElasticSearch | ---------------------------------------- - | master | master (0.19) | + | master | 0.19 -> master | + ---------------------------------------- + | 1.2.0 | 0.19 -> master | ---------------------------------------- | 1.1.0 | 0.18 -> master | ---------------------------------------- diff --git a/pom.xml b/pom.xml index 33833a97731..f8f27b2f7da 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.2.0-SNAPSHOT + 1.2.0 jar ICU Analysis for ElasticSearch 2009 @@ -31,7 +31,7 @@ - 0.19.0-SNAPSHOT + 0.19.0.RC1 From 92802ddb521f50560a4f4a020b5b368b255c0eaa Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Tue, 7 Feb 2012 15:24:39 +0200 Subject: [PATCH 014/131] move to 1.3.0 snap --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index f8f27b2f7da..875d078982a 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.2.0 + 1.3.0-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 From bc2bfb7aa1c77bc4b89e9278a036308607ba9683 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Tue, 7 Feb 2012 15:46:26 +0200 Subject: [PATCH 015/131] fix readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a92df240ef4..2da73e240f4 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,9 @@ In order to install the plugin, simply run: `bin/plugin -install elasticsearch/e ---------------------------------------- | 1.2.0 | 0.19 -> master | ---------------------------------------- - | 1.1.0 | 0.18 -> master | + | 1.1.0 | 0.18 | ---------------------------------------- - | 1.0.0 | 0.18 -> master | + | 1.0.0 | 0.18 | ---------------------------------------- From dfba64a8a0c1ebd1ee517ec1a5fdf00abd5c575e Mon Sep 17 
00:00:00 2001 From: Shay Banon Date: Sun, 26 Feb 2012 10:08:32 +0200 Subject: [PATCH 016/131] upgrade to latest assembly plugin --- pom.xml | 2 ++ src/main/assemblies/plugin.xml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 875d078982a..aa6aeea9b6c 100644 --- a/pom.xml +++ b/pom.xml @@ -117,7 +117,9 @@ maven-assembly-plugin + 2.2.2 + false ${project.build.directory}/releases/ ${basedir}/src/main/assemblies/plugin.xml diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml index c5c3a71957b..90a8ce57130 100644 --- a/src/main/assemblies/plugin.xml +++ b/src/main/assemblies/plugin.xml @@ -1,6 +1,6 @@ - + plugin zip From 1839d2fe260996acf457c19d413682b1b5bcb160 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Sun, 26 Feb 2012 10:10:31 +0200 Subject: [PATCH 017/131] upgrade to latest elasticsearch version --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index aa6aeea9b6c..ec34b5b6aaa 100644 --- a/pom.xml +++ b/pom.xml @@ -31,7 +31,7 @@ - 0.19.0.RC1 + 0.19.0.RC3 From 9c173a7f7fb4eae7db467a839480608d205058ff Mon Sep 17 00:00:00 2001 From: David Pilato Date: Sun, 26 Feb 2012 23:26:43 +0100 Subject: [PATCH 018/131] Update maven assembly plugin to latest version : 2.3 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index ec34b5b6aaa..edfc99132ed 100644 --- a/pom.xml +++ b/pom.xml @@ -117,7 +117,7 @@ maven-assembly-plugin - 2.2.2 + 2.3 false ${project.build.directory}/releases/ From 1dff19089e6896a9f9d9175d5f6cafed4c327060 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Sun, 26 Feb 2012 23:28:09 +0100 Subject: [PATCH 019/131] Ignore eclipse files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 06a1e6fedb6..3916fcee60b 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ /target .DS_Store *.iml +/.settings +/.classpath +/.project From 
96e38eca6183646c77d4fde99eec457b22e5df4b Mon Sep 17 00:00:00 2001 From: barsk Date: Tue, 28 Feb 2012 13:57:54 +0100 Subject: [PATCH 020/131] Exposing filtering capabilities via the unicodeSetFilter attribute --- .../IcuFoldingTokenFilterFactory.java | 41 +++++++++++++++---- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java index ed11a227dcf..f543654dbc3 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java @@ -1,8 +1,8 @@ /* - * Licensed to ElasticSearch and Shay Banon under one + * Licensed to Elastic Search and Shay Banon under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. ElasticSearch licenses this + * regarding copyright ownership. Elastic Search licenses this * file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at @@ -27,19 +27,46 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.Index; import org.elasticsearch.index.settings.IndexSettings; +import com.ibm.icu.text.FilteredNormalizer2; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.UnicodeSet; + /** + * Uses the {@link org.apache.lucene.analysis.icu.ICUFoldingFilter}. + * Applies foldings from UTR#30 Character Foldings. + *

+ * Can be filtered to handle certain characters in a specified way (see http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html) + * E.g national chars that should be retained (filter : "[^åäöÅÄÖ]"). * + *

The unicodeSetFilter attribute can be used to provide the UniCodeSet for filtering. + * + * @author kimchy (shay.banon) */ public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory { + private final String unicodeSetFilter; - @Inject - public IcuFoldingTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + @Inject public IcuFoldingTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); + this.unicodeSetFilter = settings.get("unicodeSetFilter"); } - @Override - public TokenStream create(TokenStream tokenStream) { - return new ICUFoldingFilter(tokenStream); + @Override public TokenStream create(TokenStream tokenStream) { + + // The ICUFoldingFilter is in fact implemented as a ICUNormalizer2Filter. + // ICUFoldingFilter lacks a constructor for adding filtering so we implemement it here + if (unicodeSetFilter != null) { + Normalizer2 base = Normalizer2.getInstance( + ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"), + "utr30", Normalizer2.Mode.COMPOSE); + UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter); + + unicodeSet.freeze(); + Normalizer2 filtered = new FilteredNormalizer2(base, unicodeSet); + return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, filtered); + } + else { + return new ICUFoldingFilter(tokenStream); + } } } \ No newline at end of file From fa8664e4b276be89eae2eab691090cda70eddde7 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Tue, 20 Mar 2012 12:29:50 +0200 Subject: [PATCH 021/131] release 1.3.0 --- README.md | 2 ++ pom.xml | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2da73e240f4..d48ab364905 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ In order to install the plugin, simply run: `bin/plugin -install elasticsearch/e 
---------------------------------------- | master | 0.19 -> master | ---------------------------------------- + | 1.3.0 | 0.19 -> master | + ---------------------------------------- | 1.2.0 | 0.19 -> master | ---------------------------------------- | 1.1.0 | 0.18 | diff --git a/pom.xml b/pom.xml index ec34b5b6aaa..9ff040ac328 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.3.0-SNAPSHOT + 1.3.0 jar ICU Analysis for ElasticSearch 2009 @@ -31,7 +31,7 @@ - 0.19.0.RC3 + 0.19.0 From 52f59689fa0e479feed649d17da9ff96638e4b41 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Tue, 20 Mar 2012 12:37:30 +0200 Subject: [PATCH 022/131] release 1.4.0 --- README.md | 2 ++ pom.xml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d48ab364905..3b9213b6b37 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ In order to install the plugin, simply run: `bin/plugin -install elasticsearch/e ---------------------------------------- | master | 0.19 -> master | ---------------------------------------- + | 1.4.0 | 0.19 -> master | + ---------------------------------------- | 1.3.0 | 0.19 -> master | ---------------------------------------- | 1.2.0 | 0.19 -> master | diff --git a/pom.xml b/pom.xml index 846ce308114..e165b0e9af3 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.3.0 + 1.4.0 jar ICU Analysis for ElasticSearch 2009 From 8aabd24584a065c651c0cef850bc07e2613ba68a Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Tue, 20 Mar 2012 12:37:53 +0200 Subject: [PATCH 023/131] move to 1.5.0 snap --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index e165b0e9af3..10c8e532137 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.4.0 + 1.5.0-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 From 0e776422925f3cf65a3b9f7c15b0a39c1448ae32 Mon 
Sep 17 00:00:00 2001 From: Shay Banon Date: Mon, 30 Apr 2012 12:35:24 +0300 Subject: [PATCH 024/131] move to lucene 3.6 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 10c8e532137..19cfeee7be5 100644 --- a/pom.xml +++ b/pom.xml @@ -31,7 +31,7 @@ - 0.19.0 + 0.19.3 @@ -48,7 +48,7 @@ org.apache.lucene lucene-icu - 3.5.0 + 3.6.0 compile From 05109d3a047193c319c538c740615533bedcf9ae Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Mon, 30 Apr 2012 12:41:07 +0300 Subject: [PATCH 025/131] release 1.5.0 --- README.md | 4 +++- pom.xml | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3b9213b6b37..3c3d18d9a53 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,15 @@ ICU Analysis for ElasticSearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.2.0`. +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.5.0`. 
---------------------------------------- | ICU Analysis Plugin | ElasticSearch | ---------------------------------------- | master | 0.19 -> master | ---------------------------------------- + | 1.5.0 | 0.19 -> master | + ---------------------------------------- | 1.4.0 | 0.19 -> master | ---------------------------------------- | 1.3.0 | 0.19 -> master | diff --git a/pom.xml b/pom.xml index 19cfeee7be5..0f18a1dd483 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.5.0-SNAPSHOT + 1.5.0 jar ICU Analysis for ElasticSearch 2009 From 2db3f03496d4d04813258f6f3b0c17e2378e8d62 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Mon, 30 Apr 2012 12:41:25 +0300 Subject: [PATCH 026/131] move to 1.6.0 snap --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 0f18a1dd483..cd852ebcaa9 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.5.0 + 1.6.0-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 From f365255030bbe1ab7038b0c764865310103ff2d1 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Sun, 10 Jun 2012 21:55:08 +0200 Subject: [PATCH 027/131] add license and repo --- LICENSE.txt | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 19 +++++ pom.xml | 4 ++ 3 files changed, 225 insertions(+) create mode 100644 LICENSE.txt diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 00000000000..d6456956733 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 3c3d18d9a53..609e4847251 100644 --- a/README.md +++ b/README.md @@ -119,3 +119,22 @@ Breaks text into words according to UAX #29: Unicode Text Segmentation ((http:// } } + +License +------- + + This software is licensed under the Apache 2 license, quoted below. + + Copyright 2009-2011 Shay Banon and ElasticSearch + + Licensed under the Apache License, Version 2.0 (the "License"); you may not + use this file except in compliance with the License. You may obtain a copy of + the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + License for the specific language governing permissions and limitations under + the License. diff --git a/pom.xml b/pom.xml index cd852ebcaa9..6818041d99b 100644 --- a/pom.xml +++ b/pom.xml @@ -35,6 +35,10 @@ + + sonatype + http://oss.sonatype.org/content/repositories/releases/ + From e0aad9bb834caabe4ffc82b1829f22cf66cdad1b Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Sun, 10 Jun 2012 21:55:47 +0200 Subject: [PATCH 028/131] fix date --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 609e4847251..6d40fc79f26 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ License This software is licensed under the Apache 2 license, quoted below. - Copyright 2009-2011 Shay Banon and ElasticSearch + Copyright 2009-2012 Shay Banon and ElasticSearch Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of From 32e0bcff22135865f00ad1471935f64456cd5ce2 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Thu, 27 Sep 2012 10:16:05 +0200 Subject: [PATCH 029/131] Replaced usage of java.util.Locale with com.ibm.icu.ULocale Closes #2290 --- .../index/analysis/IcuCollationTokenFilterFactory.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java index 3517c01b695..6c98d767022 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java @@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis; import com.ibm.icu.text.Collator; import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.util.ULocale; import org.apache.lucene.analysis.TokenStream; import 
org.apache.lucene.collation.ICUCollationKeyFilter; import org.elasticsearch.ElasticSearchIllegalArgumentException; @@ -33,7 +34,6 @@ import org.elasticsearch.index.Index; import org.elasticsearch.index.settings.IndexSettings; import java.io.IOException; -import java.util.Locale; /** * An ICU based collation token filter. There are two ways to configure collation: @@ -79,17 +79,17 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { } else { String language = settings.get("language"); if (language != null) { - Locale locale; + ULocale locale; String country = settings.get("country"); if (country != null) { String variant = settings.get("variant"); if (variant != null) { - locale = new Locale(language, country, variant); + locale = new ULocale(language, country, variant); } else { - locale = new Locale(language, country); + locale = new ULocale(language, country); } } else { - locale = new Locale(language); + locale = new ULocale(language); } collator = Collator.getInstance(locale); } else { From e1ca6182c704a36d3b14c4315a776f74d5ffc4be Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Thu, 27 Sep 2012 10:23:19 +0200 Subject: [PATCH 030/131] Updated testng, hamcrest, lucene-icu and surefire pluging versions --- pom.xml | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/pom.xml b/pom.xml index 6818041d99b..d44b7788000 100644 --- a/pom.xml +++ b/pom.xml @@ -52,37 +52,31 @@ org.apache.lucene lucene-icu - 3.6.0 + 3.6.1 compile log4j log4j - 1.2.16 + 1.2.17 runtime org.testng testng - 6.3.1 + 6.8 test org.hamcrest - hamcrest-core - 1.3.RC2 + hamcrest-all + 1.3 test - - org.hamcrest - hamcrest-library - 1.3.RC2 - test - @@ -99,7 +93,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.11 + 2.12.3 **/*Tests.java From 887877ece5355f2fb092805042924b2bcd81ff61 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Thu, 27 Sep 2012 10:24:50 +0200 Subject: [PATCH 031/131] Prepare 1.6.0 release --- 
pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index d44b7788000..123968a33ea 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.6.0-SNAPSHOT + 1.6.0 jar ICU Analysis for ElasticSearch 2009 From 14586143a64f8e1da8b5c584b47758dd253655d1 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Thu, 27 Sep 2012 10:36:43 +0200 Subject: [PATCH 032/131] Set next development version --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 123968a33ea..6265c581d1f 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.6.0 + 1.7.0-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 From e7d045ed8128303b4204e8fba204240aa9c9f73f Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Thu, 27 Sep 2012 10:41:39 +0200 Subject: [PATCH 033/131] Updated README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6d40fc79f26..de8dd59d8f7 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,15 @@ ICU Analysis for ElasticSearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.5.0`. +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.6.0`. 
---------------------------------------- | ICU Analysis Plugin | ElasticSearch | ---------------------------------------- | master | 0.19 -> master | ---------------------------------------- + | 1.6.0 | 0.19 -> master | + ---------------------------------------- | 1.5.0 | 0.19 -> master | ---------------------------------------- | 1.4.0 | 0.19 -> master | From 59d7f5cc14dd68166c512098e5ee1cea490191c9 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 28 Sep 2012 09:58:57 +0200 Subject: [PATCH 034/131] Exposed ICU collator options in IcuCollationTokenFilterFactory Closes #6 --- README.md | 25 ++ pom.xml | 10 + .../IcuCollationTokenFilterFactory.java | 77 ++++- .../analysis/SimpleIcuAnalysisTests.java | 12 +- .../SimpleIcuCollationTokenFilterTests.java | 300 ++++++++++++++++++ 5 files changed, 416 insertions(+), 8 deletions(-) create mode 100644 src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java diff --git a/README.md b/README.md index de8dd59d8f7..f29c1d5f00f 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,31 @@ And here is a sample of custom collation: } } +Optional options: +* `strength` - The strength property determines the minimum level of difference considered significant during comparison. + The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator. + Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`. + See ICU Collation:http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html documentation for a more detailed + explanation for the specific values. +* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property with +`canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were +normalized. 
If `no` is set, it is the user's responsibility to insure that all text is already in the appropriate form +before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between +faster and more complete collation behavior. Since a great many of the world's languages do not require text +normalization, most locales set `no` as the default decomposition mode. + +Expert options: +* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary` + to be either shifted or non-ignorable. What boils down to ignoring punctuation and whitespace. +* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether case level sorting is required. When + strength is set to `primary` this will ignore accent differences. +* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored + for strength `tertiary`. +* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to numeric representation. For + example the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`. +* `variableTop` - Single character or contraction. Controls what is variable for `alternate`. +* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Distinguishing between Katakana + and Hiragana characters in `quaternary` strength . 
ICU Tokenizer ------------- diff --git a/pom.xml b/pom.xml index 6265c581d1f..f7881eaaf03 100644 --- a/pom.xml +++ b/pom.xml @@ -68,6 +68,16 @@ testng 6.8 test + + + org.hamcrest + hamcrest-core + + + junit + junit + + diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java index 6c98d767022..d756e978b64 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java @@ -45,8 +45,6 @@ import java.io.IOException; *

The second option is to specify collation rules as defined in the * Collation customization chapter in icu docs. The rules parameter can either embed the rules definition * in the settings or refer to an external location (preferable located under the config location, relative to it). - * - * */ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { @@ -96,6 +94,81 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { collator = Collator.getInstance(); } } + + // set the strength flag, otherwise it will be the default. + String strength = settings.get("strength"); + if (strength != null) { + if (strength.equalsIgnoreCase("primary")) { + collator.setStrength(Collator.PRIMARY); + } else if (strength.equalsIgnoreCase("secondary")) { + collator.setStrength(Collator.SECONDARY); + } else if (strength.equalsIgnoreCase("tertiary")) { + collator.setStrength(Collator.TERTIARY); + } else if (strength.equalsIgnoreCase("quaternary")) { + collator.setStrength(Collator.QUATERNARY); + } else if (strength.equalsIgnoreCase("identical")) { + collator.setStrength(Collator.IDENTICAL); + } else { + throw new ElasticSearchIllegalArgumentException("Invalid strength: " + strength); + } + } + + // set the decomposition flag, otherwise it will be the default. 
+ String decomposition = settings.get("decomposition"); + if (decomposition != null) { + if (decomposition.equalsIgnoreCase("no")) { + collator.setDecomposition(Collator.NO_DECOMPOSITION); + } else if (decomposition.equalsIgnoreCase("canonical")) { + collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION); + } else { + throw new ElasticSearchIllegalArgumentException("Invalid decomposition: " + decomposition); + } + } + + // expert options: concrete subclasses are always a RuleBasedCollator + RuleBasedCollator rbc = (RuleBasedCollator) collator; + String alternate = settings.get("alternate"); + if (alternate != null) { + if (alternate.equalsIgnoreCase("shifted")) { + rbc.setAlternateHandlingShifted(true); + } else if (alternate.equalsIgnoreCase("non-ignorable")) { + rbc.setAlternateHandlingShifted(false); + } else { + throw new ElasticSearchIllegalArgumentException("Invalid alternate: " + alternate); + } + } + + Boolean caseLevel = settings.getAsBoolean("caseLevel", null); + if (caseLevel != null) { + rbc.setCaseLevel(caseLevel); + } + + String caseFirst = settings.get("caseFirst"); + if (caseFirst != null) { + if (caseFirst.equalsIgnoreCase("lower")) { + rbc.setLowerCaseFirst(true); + } else if (caseFirst.equalsIgnoreCase("upper")) { + rbc.setUpperCaseFirst(true); + } else { + throw new ElasticSearchIllegalArgumentException("Invalid caseFirst: " + caseFirst); + } + } + + Boolean numeric = settings.getAsBoolean("numeric", null); + if (numeric != null) { + rbc.setNumericCollation(numeric); + } + + String variableTop = settings.get("variableTop"); + if (variableTop != null) { + rbc.setVariableTop(variableTop); + } + + Boolean hiraganaQuaternaryMode = settings.getAsBoolean("hiraganaQuaternaryMode", null); + if (hiraganaQuaternaryMode != null) { + rbc.setHiraganaQuaternary(hiraganaQuaternaryMode); + } + this.collator = collator; } diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java 
b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java index a3e55a7a10e..43df9d270be 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java @@ -29,10 +29,10 @@ import org.elasticsearch.index.IndexNameModule; import org.elasticsearch.index.settings.IndexSettingsModule; import org.elasticsearch.indices.analysis.IndicesAnalysisModule; import org.elasticsearch.indices.analysis.IndicesAnalysisService; -import org.hamcrest.MatcherAssert; import org.testng.annotations.Test; import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS; +import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.instanceOf; /** @@ -53,18 +53,18 @@ public class SimpleIcuAnalysisTests { AnalysisService analysisService = injector.getInstance(AnalysisService.class); TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer"); - MatcherAssert.assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class)); + assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class)); TokenFilterFactory filterFactory = analysisService.tokenFilter("icu_normalizer"); - MatcherAssert.assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class)); + assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class)); filterFactory = analysisService.tokenFilter("icu_folding"); - MatcherAssert.assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class)); + assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class)); filterFactory = analysisService.tokenFilter("icu_collation"); - MatcherAssert.assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class)); + assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class)); filterFactory = analysisService.tokenFilter("icu_transform"); - MatcherAssert.assertThat(filterFactory, 
instanceOf(IcuTransformTokenFilterFactory.class)); + assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class)); } } diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java new file mode 100644 index 00000000000..5d0b60b9575 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java @@ -0,0 +1,300 @@ +package org.elasticsearch.index.analysis; + +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.util.ULocale; +import org.apache.lucene.analysis.KeywordTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.elasticsearch.common.inject.Injector; +import org.elasticsearch.common.inject.ModulesBuilder; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.settings.SettingsModule; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.EnvironmentModule; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexNameModule; +import org.elasticsearch.index.settings.IndexSettingsModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisService; +import org.testng.annotations.Test; + +import java.io.IOException; +import java.io.StringReader; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.equalTo; + +// Tests borrowed from Solr's Icu collation key filter factory test. +public class SimpleIcuCollationTokenFilterTests { + + /* + * Turkish has some funny casing. + * This test shows how you can solve this kind of thing easily with collation. 
+ * Instead of using LowerCaseFilter, use a turkish collator with primary strength. + * Then things will sort and match correctly. + */ + @Test + public void testBasicUsage() throws Exception { + Index index = new Index("test"); + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "tr") + .put("index.analysis.filter.myCollator.strength", "primary") + .build(); + AnalysisService analysisService = createAnalysisService(index, settings); + + String turkishUpperCase = "I WİLL USE TURKİSH CASING"; + String turkishLowerCase = "ı will use turkish casıng"; + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + + TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase))); + TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase))); + assertCollatesToSame(tsUpper, tsLower); + } + + /* + * Test usage of the decomposition option for unicode normalization. 
+ */ + @Test + public void testNormalization() throws IOException { + Index index = new Index("test"); + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "tr") + .put("index.analysis.filter.myCollator.strength", "primary") + .put("index.analysis.filter.myCollator.decomposition", "canonical") + .build(); + AnalysisService analysisService = createAnalysisService(index, settings); + + String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING"; + String turkishLowerCase = "ı will use turkish casıng"; + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + + TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase))); + TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase))); + assertCollatesToSame(tsUpper, tsLower); + } + + /* + * Test secondary strength, for english case is not significant. 
+ */ + @Test + public void testSecondaryStrength() throws IOException { + Index index = new Index("test"); + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "en") + .put("index.analysis.filter.myCollator.strength", "secondary") + .put("index.analysis.filter.myCollator.decomposition", "no") + .build(); + AnalysisService analysisService = createAnalysisService(index, settings); + + String upperCase = "TESTING"; + String lowerCase = "testing"; + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + + TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upperCase))); + TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lowerCase))); + assertCollatesToSame(tsUpper, tsLower); + } + + /* + * Setting alternate=shifted to shift whitespace, punctuation and symbols + * to quaternary level + */ + @Test + public void testIgnorePunctuation() throws IOException { + Index index = new Index("test"); + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "en") + .put("index.analysis.filter.myCollator.strength", "primary") + .put("index.analysis.filter.myCollator.alternate", "shifted") + .build(); + AnalysisService analysisService = createAnalysisService(index, settings); + + String withPunctuation = "foo-bar"; + String withoutPunctuation = "foo bar"; + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + + TokenStream tsPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation))); + TokenStream tsWithoutPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withoutPunctuation))); + assertCollatesToSame(tsPunctuation, tsWithoutPunctuation); + } + + /* + * Setting alternate=shifted and 
variableTop to shift whitespace, but not + * punctuation or symbols, to quaternary level + */ + @Test + public void testIgnoreWhitespace() throws IOException { + Index index = new Index("test"); + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "en") + .put("index.analysis.filter.myCollator.strength", "primary") + .put("index.analysis.filter.myCollator.alternate", "shifted") + .put("index.analysis.filter.myCollator.variableTop", " ") + .build(); + AnalysisService analysisService = createAnalysisService(index, settings); + + String withSpace = "foo bar"; + String withoutSpace = "foobar"; + String withPunctuation = "foo-bar"; + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + + TokenStream tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace))); + TokenStream tsWithoutSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withoutSpace))); + assertCollatesToSame(tsWithSpace, tsWithoutSpace); + // now assert that punctuation still matters: foo-bar < foo bar + tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace))); + TokenStream tsWithPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation))); + assertCollation(tsWithPunctuation, tsWithSpace, -1); + } + + /* + * Setting numeric to encode digits with numeric value, so that + * foobar-9 sorts before foobar-10 + */ + @Test + public void testNumerics() throws IOException { + Index index = new Index("test"); + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "en") + .put("index.analysis.filter.myCollator.numeric", "true") + .build(); + AnalysisService analysisService = createAnalysisService(index, settings); + + String nine = "foobar-9"; + String 
ten = "foobar-10"; + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + + TokenStream tsNine = filterFactory.create(new KeywordTokenizer(new StringReader(nine))); + TokenStream tsTen = filterFactory.create(new KeywordTokenizer(new StringReader(ten))); + assertCollation(tsNine, tsTen, -1); + } + + /* + * Setting caseLevel=true to create an additional case level between + * secondary and tertiary + */ + @Test + public void testIgnoreAccentsButNotCase() throws IOException { + Index index = new Index("test"); + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "en") + .put("index.analysis.filter.myCollator.strength", "primary") + .put("index.analysis.filter.myCollator.caseLevel", "true") + .build(); + AnalysisService analysisService = createAnalysisService(index, settings); + + String withAccents = "résumé"; + String withoutAccents = "resume"; + String withAccentsUpperCase = "Résumé"; + String withoutAccentsUpperCase = "Resume"; + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + + TokenStream tsWithAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withAccents))); + TokenStream tsWithoutAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents))); + assertCollatesToSame(tsWithAccents, tsWithoutAccents); + + TokenStream tsWithAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withAccentsUpperCase))); + TokenStream tsWithoutAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase))); + assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase); + + // now assert that case still matters: resume < Resume + TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents))); + TokenStream tsUpper = filterFactory.create(new 
KeywordTokenizer(new StringReader(withoutAccentsUpperCase))); + assertCollation(tsLower, tsUpper, -1); + } + + /* + * Setting caseFirst=upper to cause uppercase strings to sort + * before lowercase ones. + */ + @Test + public void testUpperCaseFirst() throws IOException { + Index index = new Index("test"); + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "en") + .put("index.analysis.filter.myCollator.strength", "tertiary") + .put("index.analysis.filter.myCollator.caseFirst", "upper") + .build(); + AnalysisService analysisService = createAnalysisService(index, settings); + + String lower = "resume"; + String upper = "Resume"; + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + + TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lower))); + TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upper))); + assertCollation(tsUpper, tsLower, -1); + } + + /* + * For german, you might want oe to sort and match with o umlaut. + * This is not the default, but you can make a customized ruleset to do this. + * + * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior. 
+ * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383 + */ + @Test + public void testCustomRules() throws Exception { + RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE")); + String DIN5007_2_tailorings = + "& ae , a\u0308 & AE , A\u0308"+ + "& oe , o\u0308 & OE , O\u0308"+ + "& ue , u\u0308 & UE , u\u0308"; + + RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings); + String tailoredRules = tailoredCollator.getRules(); + + Index index = new Index("test"); + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.rules", tailoredRules) + .put("index.analysis.filter.myCollator.strength", "primary") + .build(); + AnalysisService analysisService = createAnalysisService(index, settings); + + String germanUmlaut = "Töne"; + String germanOE = "Toene"; + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + TokenStream tsUmlaut = filterFactory.create(new KeywordTokenizer(new StringReader(germanUmlaut))); + TokenStream tsOE = filterFactory.create(new KeywordTokenizer(new StringReader(germanOE))); + assertCollatesToSame(tsUmlaut, tsOE); + } + + private AnalysisService createAnalysisService(Index index, Settings settings) { + Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector(); + Injector injector = new ModulesBuilder().add( + new IndexSettingsModule(index, settings), + new IndexNameModule(index), + new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor())) + .createChildInjector(parentInjector); + + return injector.getInstance(AnalysisService.class); + } + + private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws 
IOException { + assertCollation(stream1, stream2, 0); + } + + private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException { + CharTermAttribute term1 = stream1 + .addAttribute(CharTermAttribute.class); + CharTermAttribute term2 = stream2 + .addAttribute(CharTermAttribute.class); + assertThat(stream1.incrementToken(), equalTo(true)); + assertThat(stream2.incrementToken(), equalTo(true)); + assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison))); + assertThat(stream1.incrementToken(), equalTo(false)); + assertThat(stream2.incrementToken(), equalTo(false)); + } + +} From ede75695235929ca4675b8f67dbd4a3cdd4a0576 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 28 Sep 2012 10:45:25 +0200 Subject: [PATCH 035/131] Prepare 1.7.0 release --- README.md | 4 +++- pom.xml | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f29c1d5f00f..9f00df01092 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,15 @@ ICU Analysis for ElasticSearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.6.0`. +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.7.0`. 
---------------------------------------- | ICU Analysis Plugin | ElasticSearch | ---------------------------------------- | master | 0.19 -> master | ---------------------------------------- + | 1.7.0 | 0.19 -> master | + ---------------------------------------- | 1.6.0 | 0.19 -> master | ---------------------------------------- | 1.5.0 | 0.19 -> master | diff --git a/pom.xml b/pom.xml index f7881eaaf03..8fe70076dfc 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.7.0-SNAPSHOT + 1.7.0 jar ICU Analysis for ElasticSearch 2009 From aee7ee6b516894be39ff02620a31e79516c47f24 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 28 Sep 2012 12:02:16 +0200 Subject: [PATCH 036/131] Set next development version --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8fe70076dfc..1f15a56b93b 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.7.0 + 1.8.0-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 From c69d6d5932ca382bf8742c9cd3f192b8d14d00d0 Mon Sep 17 00:00:00 2001 From: Olivier Favre Date: Mon, 4 Feb 2013 11:54:15 +0100 Subject: [PATCH 037/131] Upgrade to Lucene 4.1 --- pom.xml | 8 ++++---- .../analysis/SimpleIcuCollationTokenFilterTests.java | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index 1f15a56b93b..2d2f88589ee 100644 --- a/pom.xml +++ b/pom.xml @@ -31,7 +31,7 @@ - 0.19.3 + 0.21.0.Beta1-SNAPSHOT @@ -51,8 +51,8 @@ org.apache.lucene - lucene-icu - 3.6.1 + lucene-analyzers-icu + 4.1.0 compile @@ -144,4 +144,4 @@ - \ No newline at end of file + diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java index 5d0b60b9575..e4475e853b9 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java 
+++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java @@ -3,7 +3,7 @@ package org.elasticsearch.index.analysis; import com.ibm.icu.text.Collator; import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.util.ULocale; -import org.apache.lucene.analysis.KeywordTokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.elasticsearch.common.inject.Injector; From d5ef4de88577d4248c8b89fa2e449e5603b0888d Mon Sep 17 00:00:00 2001 From: David Pilato Date: Sun, 24 Feb 2013 22:18:15 +0100 Subject: [PATCH 038/131] Upgrade to Lucene 4.1 Update README Use lucene.version in pom Relative to #8 --- README.md | 14 +++++++------- pom.xml | 4 +++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 9f00df01092..b4062eb1f4d 100644 --- a/README.md +++ b/README.md @@ -8,19 +8,19 @@ In order to install the plugin, simply run: `bin/plugin -install elasticsearch/e ---------------------------------------- | ICU Analysis Plugin | ElasticSearch | ---------------------------------------- - | master | 0.19 -> master | + | master | 0.21 -> master | ---------------------------------------- - | 1.7.0 | 0.19 -> master | + | 1.7.0 | 0.19 -> 0.20 | ---------------------------------------- - | 1.6.0 | 0.19 -> master | + | 1.6.0 | 0.19 -> 0.20 | ---------------------------------------- - | 1.5.0 | 0.19 -> master | + | 1.5.0 | 0.19 -> 0.20 | ---------------------------------------- - | 1.4.0 | 0.19 -> master | + | 1.4.0 | 0.19 -> 0.20 | ---------------------------------------- - | 1.3.0 | 0.19 -> master | + | 1.3.0 | 0.19 -> 0.20 | ---------------------------------------- - | 1.2.0 | 0.19 -> master | + | 1.2.0 | 0.19 -> 0.20 | ---------------------------------------- | 1.1.0 | 0.18 | ---------------------------------------- diff --git a/pom.xml b/pom.xml index 2d2f88589ee..9c4c12585d1 
100644 --- a/pom.xml +++ b/pom.xml @@ -32,6 +32,7 @@ 0.21.0.Beta1-SNAPSHOT + 4.1.0 @@ -52,7 +53,7 @@ org.apache.lucene lucene-analyzers-icu - 4.1.0 + ${lucene.version} compile @@ -145,3 +146,4 @@ + From 1e3f8cf20cf920b1e4259a5cbed2950e7b6f7163 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Tue, 26 Feb 2013 15:58:30 +0100 Subject: [PATCH 039/131] release 1.8 --- README.md | 6 ++++-- pom.xml | 4 ++-- src/main/assemblies/plugin.xml | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index b4062eb1f4d..943a09e3797 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,14 @@ ICU Analysis for ElasticSearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.7.0`. +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.8.0`. ---------------------------------------- | ICU Analysis Plugin | ElasticSearch | ---------------------------------------- - | master | 0.21 -> master | + | master | 0.90 -> master | + ---------------------------------------- + | 1.8.0 | 0.90 -> master | ---------------------------------------- | 1.7.0 | 0.19 -> 0.20 | ---------------------------------------- diff --git a/pom.xml b/pom.xml index 9c4c12585d1..3abe08d46be 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.8.0-SNAPSHOT + 1.8.0 jar ICU Analysis for ElasticSearch 2009 @@ -31,7 +31,7 @@ - 0.21.0.Beta1-SNAPSHOT + 0.90.0.Beta1 4.1.0 diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml index 90a8ce57130..99365f60c49 100644 --- a/src/main/assemblies/plugin.xml +++ b/src/main/assemblies/plugin.xml @@ -19,7 +19,7 @@ true true - org.apache.lucene:lucene-icu + org.apache.lucene:lucene-analyzers-icu From e7e3c7c0cc49e15707715b6e9739804010add17b Mon Sep 17 00:00:00 2001 
From: Shay Banon Date: Tue, 26 Feb 2013 15:58:48 +0100 Subject: [PATCH 040/131] move to 1.9 snap --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 3abe08d46be..5f3e3bc3b7e 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.8.0 + 1.9.0-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 From 73435b6422a653c7873a8635f8ab4a1c05b23cd1 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Mon, 29 Apr 2013 21:47:25 +0200 Subject: [PATCH 041/131] move to Lucene 4.2.1 and ES 0.90 GA --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 5f3e3bc3b7e..2c2bdbaeb61 100644 --- a/pom.xml +++ b/pom.xml @@ -31,8 +31,8 @@ - 0.90.0.Beta1 - 4.1.0 + 0.90.0 + 4.2.1 From ca96f4a2dab88b11e19026d9ae7ba69d54bce13f Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Mon, 29 Apr 2013 21:50:12 +0200 Subject: [PATCH 042/131] release 1.9 --- README.md | 4 +++- pom.xml | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 943a09e3797..af6fca3ea2d 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,15 @@ ICU Analysis for ElasticSearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.8.0`. +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.9.0`. 
---------------------------------------- | ICU Analysis Plugin | ElasticSearch | ---------------------------------------- | master | 0.90 -> master | ---------------------------------------- + | 1.9.0 | 0.90 -> master | + ---------------------------------------- | 1.8.0 | 0.90 -> master | ---------------------------------------- | 1.7.0 | 0.19 -> 0.20 | diff --git a/pom.xml b/pom.xml index 2c2bdbaeb61..01caf27759d 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.9.0-SNAPSHOT + 1.9.0 jar ICU Analysis for ElasticSearch 2009 From a4632040433ccba76e7c45f7a12ecdf8de451d79 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Mon, 29 Apr 2013 21:50:28 +0200 Subject: [PATCH 043/131] move to 1.10 snap --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 01caf27759d..89eb2290be9 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.9.0 + 1.10.0-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 From 5f460ed734c2dc78c864a05b4d61c0d048efc004 Mon Sep 17 00:00:00 2001 From: Clinton Gormley Date: Thu, 9 May 2013 12:41:05 +0200 Subject: [PATCH 044/131] Changed textile markup to markdown in README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index af6fca3ea2d..c47bbe99eb2 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ In order to install the plugin, simply run: `bin/plugin -install elasticsearch/e ICU Normalization ----------------- -Normalizes characters as explained "here":http://userguide.icu-project.org/transforms/normalization. It registers itself by default under @icu_normalizer@ or @icuNormalizer@ using the default settings. Allows for the name parameter to be provided which can include the following values: @nfc@, @nfkc@, and @nfkc_cf@. 
Here is a sample settings: +Normalizes characters as explained "here":http://userguide.icu-project.org/transforms/normalization. It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings. Allows for the name parameter to be provided which can include the following values: `nfc`, `nfkc`, and `nfkc_cf`. Here is a sample settings: { "index" : { @@ -53,7 +53,7 @@ Normalizes characters as explained "here":http://userguide.icu-project.org/trans ICU Folding ----------- -Folding of unicode characters based on @UTR#30@. It registers itself under @icu_folding@ and @icuFolding@ names. Sample setting: +Folding of unicode characters based on `UTR#30`. It registers itself under `icu_folding` and `icuFolding` names. Sample setting: { "index" : { @@ -71,7 +71,7 @@ Folding of unicode characters based on @UTR#30@. It registers itself under @icu_ ICU Collation ------------- -Uses collation token filter. Allows to either specify the rules for collation (defined "here":http://www.icu-project.org/userguide/Collate_Customization.html) using the @rules@ parameter (can point to a location or expressed in the settings, location can be relative to config location), or using the @language@ parameter (further specialized by country and variant). By default registers under @icu_collation@ or @icuCollation@ and uses the default locale. +Uses collation token filter. Allows to either specify the rules for collation (defined "here":http://www.icu-project.org/userguide/Collate_Customization.html) using the `rules` parameter (can point to a location or expressed in the settings, location can be relative to config location), or using the `language` parameter (further specialized by country and variant). By default registers under `icu_collation` or `icuCollation` and uses the default locale. 
Here is a sample settings: From 82b95772a3de57a816067673f37fc583ccd5dc58 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 30 May 2013 21:54:31 +0200 Subject: [PATCH 045/131] Update to Elasticsearch 0.90.1 / Lucene 4.3.0 Closes #10. --- README.md | 52 ++++++++++++++++++++++++++-------------------------- pom.xml | 4 ++-- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index c47bbe99eb2..a7df9a4d5a3 100644 --- a/README.md +++ b/README.md @@ -5,31 +5,31 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.9.0`. - ---------------------------------------- - | ICU Analysis Plugin | ElasticSearch | - ---------------------------------------- - | master | 0.90 -> master | - ---------------------------------------- - | 1.9.0 | 0.90 -> master | - ---------------------------------------- - | 1.8.0 | 0.90 -> master | - ---------------------------------------- - | 1.7.0 | 0.19 -> 0.20 | - ---------------------------------------- - | 1.6.0 | 0.19 -> 0.20 | - ---------------------------------------- - | 1.5.0 | 0.19 -> 0.20 | - ---------------------------------------- - | 1.4.0 | 0.19 -> 0.20 | - ---------------------------------------- - | 1.3.0 | 0.19 -> 0.20 | - ---------------------------------------- - | 1.2.0 | 0.19 -> 0.20 | - ---------------------------------------- - | 1.1.0 | 0.18 | - ---------------------------------------- - | 1.0.0 | 0.18 | - ---------------------------------------- + ----------------------------------------------- + | ICU Analysis Plugin | ElasticSearch | + ----------------------------------------------- + | 1.10.0-SNAPSHOT (master) | 0.90.1 -> master | + ----------------------------------------------- + | 1.9.0 | 0.90.0 | + ----------------------------------------------- + | 1.8.0 | 0.90.0 | + ----------------------------------------------- + | 1.7.0 | 0.19 -> 0.20 | + 
----------------------------------------------- + | 1.6.0 | 0.19 -> 0.20 | + ----------------------------------------------- + | 1.5.0 | 0.19 -> 0.20 | + ----------------------------------------------- + | 1.4.0 | 0.19 -> 0.20 | + ----------------------------------------------- + | 1.3.0 | 0.19 -> 0.20 | + ----------------------------------------------- + | 1.2.0 | 0.19 -> 0.20 | + ----------------------------------------------- + | 1.1.0 | 0.18 | + ----------------------------------------------- + | 1.0.0 | 0.18 | + ----------------------------------------------- ICU Normalization @@ -158,7 +158,7 @@ License This software is licensed under the Apache 2 license, quoted below. - Copyright 2009-2012 Shay Banon and ElasticSearch + Copyright 2009-2013 Shay Banon and ElasticSearch Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of diff --git a/pom.xml b/pom.xml index 89eb2290be9..96119b0149f 100644 --- a/pom.xml +++ b/pom.xml @@ -31,8 +31,8 @@ - 0.90.0 - 4.2.1 + 0.90.1 + 4.3.0 From 937e076c10605bc07e9fd2065ad2600961e41f8f Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 30 May 2013 21:56:34 +0200 Subject: [PATCH 046/131] prepare release elasticsearch-analysis-icu-1.10.0 --- README.md | 6 ++++-- pom.xml | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a7df9a4d5a3..e27fdd4acdb 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,14 @@ ICU Analysis for ElasticSearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.9.0`. +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.10.0`. 
----------------------------------------------- | ICU Analysis Plugin | ElasticSearch | ----------------------------------------------- - | 1.10.0-SNAPSHOT (master) | 0.90.1 -> master | + | 1.11.0-SNAPSHOT (master) | 0.90.1 -> master | + ----------------------------------------------- + | 1.10.0 | 0.90.1 -> master | ----------------------------------------------- | 1.9.0 | 0.90.0 | ----------------------------------------------- diff --git a/pom.xml b/pom.xml index 96119b0149f..4c4ec9121af 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.10.0-SNAPSHOT + 1.10.0 jar ICU Analysis for ElasticSearch 2009 From ced9ce0affb84a6cc839f4843c1bb8500a83429c Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 30 May 2013 22:03:36 +0200 Subject: [PATCH 047/131] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 4c4ec9121af..9e486e63cfd 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.10.0 + 1.11.0-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 From 0347fcd710dde181b8050770c9f0804c631ee3a1 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 8 Aug 2013 15:13:34 +0200 Subject: [PATCH 048/131] Update to Elasticsearch 0.90.3 / Lucene 4.4.0 Closes #11. 
--- README.md | 4 ++-- pom.xml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e27fdd4acdb..9c77a7f23bb 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,9 @@ In order to install the plugin, simply run: `bin/plugin -install elasticsearch/e ----------------------------------------------- | ICU Analysis Plugin | ElasticSearch | ----------------------------------------------- - | 1.11.0-SNAPSHOT (master) | 0.90.1 -> master | + | 1.11.0-SNAPSHOT (master) | 0.90.3 -> master | ----------------------------------------------- - | 1.10.0 | 0.90.1 -> master | + | 1.10.0 | 0.90.1 -> 0.90.2 | ----------------------------------------------- | 1.9.0 | 0.90.0 | ----------------------------------------------- diff --git a/pom.xml b/pom.xml index 9e486e63cfd..f0eb4d0ff12 100644 --- a/pom.xml +++ b/pom.xml @@ -31,8 +31,8 @@ - 0.90.1 - 4.3.0 + 0.90.3 + 4.4.0 From 80fd25efb78ca5823175c822139d91f4124d22ab Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 8 Aug 2013 15:14:58 +0200 Subject: [PATCH 049/131] prepare release elasticsearch-analysis-icu-1.11.0 --- README.md | 6 ++++-- pom.xml | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9c77a7f23bb..d0b764f4d6a 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,14 @@ ICU Analysis for ElasticSearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.10.0`. +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.11.0`. 
----------------------------------------------- | ICU Analysis Plugin | ElasticSearch | ----------------------------------------------- - | 1.11.0-SNAPSHOT (master) | 0.90.3 -> master | + | 1.12.0-SNAPSHOT (master) | 0.90.3 -> master | + ----------------------------------------------- + | 1.11.0 | 0.90.3 -> master | ----------------------------------------------- | 1.10.0 | 0.90.1 -> 0.90.2 | ----------------------------------------------- diff --git a/pom.xml b/pom.xml index f0eb4d0ff12..3897d1f84fc 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.11.0-SNAPSHOT + 1.11.0 jar ICU Analysis for ElasticSearch 2009 From e99a2e4944e753319ef8d7a5e136f32a30e52a97 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 8 Aug 2013 15:23:57 +0200 Subject: [PATCH 050/131] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 3897d1f84fc..d30b0cdbdc4 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.11.0 + 1.12.0-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 From 880c864000bda21250fc38f46326ae90b672fe97 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 21 Aug 2013 11:53:21 +0200 Subject: [PATCH 051/131] Create CONTRIBUTING.md --- CONTRIBUTING.md | 98 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000000..238e8c368f1 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,98 @@ +Contributing to elasticsearch +============================= + +Elasticsearch is an open source project and we love to receive contributions from our community — you! 
There are many ways to contribute, from writing tutorials or blog posts, improving the documentation, submitting bug reports and feature requests or writing code which can be incorporated into Elasticsearch itself. + +Bug reports +----------- + +If you think you have found a bug in Elasticsearch, first make sure that you are testing against the [latest version of Elasticsearch](http://www.elasticsearch.org/download/) - your issue may already have been fixed. If not, search our [issues list](https://github.com/elasticsearch/elasticsearch/issues) on GitHub in case a similar issue has already been opened. + +It is very helpful if you can prepare a reproduction of the bug. In other words, provide a small test case which we can run to confirm your bug. It makes it easier to find the problem and to fix it. Test cases should be provided as `curl` commands which we can copy and paste into a terminal to run it locally, for example: + +```sh +# delete the index +curl -XDELETE localhost:9200/test + +# insert a document +curl -XPUT localhost:9200/test/test/1 -d '{ + "title": "test document" +}' + +# this should return XXXX but instead returns YYY +curl .... +``` + +Provide as much information as you can. You may think that the problem lies with your query, when actually it depends on how your data is indexed. The easier it is for us to recreate your problem, the faster it is likely to be fixed. + +Feature requests +---------------- + +If you find yourself wishing for a feature that doesn't exist in Elasticsearch, you are probably not alone. There are bound to be others out there with similar needs. Many of the features that Elasticsearch has today have been added because our users saw the need. +Open an issue on our [issues list](https://github.com/elasticsearch/elasticsearch/issues) on GitHub which describes the feature you would like to see, why you need it, and how it should work. 
+ +Contributing code and documentation changes +------------------------------------------- + +If you have a bugfix or new feature that you would like to contribute to Elasticsearch, please find or open an issue about it first. Talk about what you would like to do. It may be that somebody is already working on it, or that there are particular issues that you should know about before implementing the change. + +We enjoy working with contributors to get their code accepted. There are many approaches to fixing a problem and it is important to find the best approach before writing too much code. + +The process for contributing to any of the [Elasticsearch repositories](https://github.com/elasticsearch/) is similar. Details for individual projects can be found below. + +### Fork and clone the repository + +You will need to fork the main Elasticsearch code or documentation repository and clone it to your local machine. See +[github help page](https://help.github.com/articles/fork-a-repo) for help. + +Further instructions for specific projects are given below. + +### Submitting your changes + +Once your changes and tests are ready to submit for review: + +1. Test your changes +Run the test suite to make sure that nothing is broken. + +2. Sign the Contributor License Agreement +Please make sure you have signed our [Contributor License Agreement](http://www.elasticsearch.org/contributor-agreement/). We are not asking you to assign copyright to us, but to give us the right to distribute your code without restriction. We ask this of all contributors in order to assure our users of the origin and continuing existence of the code. You only need to sign the CLA once. + +3. Rebase your changes +Update your local repository with the most recent code from the main Elasticsearch repository, and rebase your branch on top of the latest master branch. We prefer your changes to be squashed into a single commit. + +4. 
Submit a pull request +Push your local changes to your forked copy of the repository and [submit a pull request](https://help.github.com/articles/using-pull-requests). In the pull request, describe what your changes do and mention the number of the issue where discussion has taken place, eg "Closes #123". + +Then sit back and wait. There will probably be discussion about the pull request and, if any changes are needed, we would love to work with you to get your pull request merged into Elasticsearch. + + +Contributing to the Elasticsearch plugin +---------------------------------------- + +**Repository:** [https://github.com/elasticsearch/elasticsearch-analysis-icu](https://github.com/elasticsearch/elasticsearch-analysis-icu) + +Make sure you have [Maven](http://maven.apache.org) installed, as Elasticsearch uses it as its build system. Integration with IntelliJ and Eclipse should work out of the box. Eclipse users can automatically configure their IDE by running `mvn eclipse:eclipse` and then importing the project into their workspace: `File > Import > Existing project into workspace`. + +Please follow these formatting guidelines: + +* Java indent is 4 spaces +* Line width is 140 characters +* The rest is left to Java coding standards +* Disable “auto-format on save” to prevent unnecessary format changes. This makes reviews much harder as it generates unnecessary formatting changes. If your IDE supports formatting only modified chunks that is fine to do. + +To create a distribution from the source, simply run: + +```sh +cd elasticsearch-analysis-icu/ +mvn clean package -DskipTests +``` + +You will find the newly built packages under: `./target/releases/`. 
+ +Before submitting your changes, run the test suite to make sure that nothing is broken, with: + +```sh +mvn clean test +``` + +Source: [Contributing to elasticsearch](http://www.elasticsearch.org/contributing-to-elasticsearch/) From 059577863e189cb366b66a31d8d4a7c3954dad08 Mon Sep 17 00:00:00 2001 From: Clinton Gormley Date: Sat, 5 Oct 2013 16:29:40 +0200 Subject: [PATCH 052/131] Update README.md --- README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/README.md b/README.md index d0b764f4d6a..c1abaf1c6ad 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,32 @@ Folding of unicode characters based on `UTR#30`. It registers itself under `icu_ } } +ICU Filtering +------------- + +The folding can be filtered by a set of unicode characters with the parameter `unicodeSetFilter`. This is useful for a non-internationalized search engine where retaining a set of national characters which are primary letters in a specific language is wanted. See syntax for the UnicodeSet "here":http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html. + +The Following example exempts Swedish characters from the folding. Note that the filtered characters are NOT lowercased which is why we add that filter below. + + { + "index" : { + "analysis" : { + "analyzer" : { + "folding" : { + "tokenizer" : "standard", + "filter" : ["my_icu_folding", "lowercase"] + } + } + "filter" : { + "my_icu_folding" : { + "type" : "icu_folding" + "unicodeSetFilter" : "[^åäöÅÄÖ]" + } + } + } + } + } + ICU Collation ------------- From c46e23e680aa8e99a0c4f2bb18c737c97475db18 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 6 Nov 2013 10:22:39 +0100 Subject: [PATCH 053/131] Update to Elasticsearch 0.90.6 / Lucene 4.5.1 Closes #12. 
--- README.md | 106 +++++++++++++++++++++++++++++++++++++++--------------- pom.xml | 4 +-- 2 files changed, 79 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index c1abaf1c6ad..0306d110f0f 100644 --- a/README.md +++ b/README.md @@ -5,35 +5,83 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.11.0`. - ----------------------------------------------- - | ICU Analysis Plugin | ElasticSearch | - ----------------------------------------------- - | 1.12.0-SNAPSHOT (master) | 0.90.3 -> master | - ----------------------------------------------- - | 1.11.0 | 0.90.3 -> master | - ----------------------------------------------- - | 1.10.0 | 0.90.1 -> 0.90.2 | - ----------------------------------------------- - | 1.9.0 | 0.90.0 | - ----------------------------------------------- - | 1.8.0 | 0.90.0 | - ----------------------------------------------- - | 1.7.0 | 0.19 -> 0.20 | - ----------------------------------------------- - | 1.6.0 | 0.19 -> 0.20 | - ----------------------------------------------- - | 1.5.0 | 0.19 -> 0.20 | - ----------------------------------------------- - | 1.4.0 | 0.19 -> 0.20 | - ----------------------------------------------- - | 1.3.0 | 0.19 -> 0.20 | - ----------------------------------------------- - | 1.2.0 | 0.19 -> 0.20 | - ----------------------------------------------- - | 1.1.0 | 0.18 | - ----------------------------------------------- - | 1.0.0 | 0.18 | - ----------------------------------------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ICU Analysis PluginElasticsearchRelease date
1.12.0-SNAPSHOT (master)0.90.6 -> master
1.11.00.90.3 -> 0.90.52013-08-08
1.10.00.90.1 -> 0.90.22013-05-30
1.9.00.90.02013-04-29
1.8.00.90.02013-02-26
1.7.00.19 -> 0.202012-09-28
1.6.00.19 -> 0.202012-09-27
1.5.00.19 -> 0.202012-04-30
1.4.00.19 -> 0.202012-03-20
1.3.00.19 -> 0.202012-03-20
1.2.00.19 -> 0.202012-02-07
1.1.00.182011-12-13
1.0.00.182011-12-05
ICU Normalization diff --git a/pom.xml b/pom.xml index d30b0cdbdc4..d813773a239 100644 --- a/pom.xml +++ b/pom.xml @@ -31,8 +31,8 @@ - 0.90.3 - 4.4.0 + 0.90.6 + 4.5.1 From b5ae609e4c5518fb77717b1fb336b9475b5de59a Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 6 Nov 2013 10:46:52 +0100 Subject: [PATCH 054/131] prepare release elasticsearch-analysis-icu-1.12.0 --- README.md | 9 +++++++-- pom.xml | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0306d110f0f..19651fd47cd 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ICU Analysis for ElasticSearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.11.0`. +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.12.0`. @@ -16,10 +16,15 @@ In order to install the plugin, simply run: `bin/plugin -install elasticsearch/e - + + + + + + diff --git a/pom.xml b/pom.xml index d813773a239..9e51d76d9ad 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0org.elasticsearchelasticsearch-analysis-icu - 1.12.0-SNAPSHOT + 1.12.0jarICU Analysis for ElasticSearch2009 From 3d03f5281146a2d22ab4338e12cba47dd97e68f0 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 6 Nov 2013 10:51:29 +0100 Subject: [PATCH 055/131] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 9e51d76d9ad..81e3f34aa94 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.12.0 + 1.13.0-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 From 14cbe08c08e3b6aba8e599b578a68d54308f964b Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 19 Dec 2013 09:46:47 +0100 Subject: [PATCH 056/131] Update to Elasticsearch 0.90.8 / Lucene 4.6.0 Closes #15. 
--- README.md | 98 +++---------------- pom.xml | 4 +- .../SimpleIcuCollationTokenFilterTests.java | 6 +- 3 files changed, 23 insertions(+), 85 deletions(-) diff --git a/README.md b/README.md index 19651fd47cd..d125ec0b437 100644 --- a/README.md +++ b/README.md @@ -5,88 +5,22 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.12.0`. - -
1.12.0-SNAPSHOT (master)1.13.0-SNAPSHOT (master) 0.90.6 -> master
1.12.00.90.6 -> master2013-11-06
1.11.0 0.90.3 -> 0.90.5
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ICU Analysis PluginElasticsearchRelease date
1.13.0-SNAPSHOT (master)0.90.6 -> master
1.12.00.90.6 -> master2013-11-06
1.11.00.90.3 -> 0.90.52013-08-08
1.10.00.90.1 -> 0.90.22013-05-30
1.9.00.90.02013-04-29
1.8.00.90.02013-02-26
1.7.00.19 -> 0.202012-09-28
1.6.00.19 -> 0.202012-09-27
1.5.00.19 -> 0.202012-04-30
1.4.00.19 -> 0.202012-03-20
1.3.00.19 -> 0.202012-03-20
1.2.00.19 -> 0.202012-02-07
1.1.00.182011-12-13
1.0.00.182011-12-05
+| ICU Analysis Plugin | elasticsearch | Release date | +|--------------------------|------------------|:------------:| +| 1.13.0-SNAPSHOT (master) | 0.90.8 -> master | 2013-12-19 | +| 1.12.0 | 0.90.6 -> 0.90.7 | 2013-11-06 | +| 1.11.0 | 0.90.3 -> 0.90.5 | 2013-08-08 | +| 1.10.0 | 0.90.1 -> 0.90.2 | 2013-05-30 | +| 1.9.0 | 0.90.0 | 2013-04-29 | +| 1.8.0 | 0.90.0 | 2013-02-26 | +| 1.7.0 | 0.19 -> 0.20 | 2012-09-28 | +| 1.6.0 | 0.19 -> 0.20 | 2012-09-27 | +| 1.5.0 | 0.19 -> 0.20 | 2012-04-30 | +| 1.4.0 | 0.19 -> 0.20 | 2012-03-20 | +| 1.3.0 | 0.19 -> 0.20 | 2012-03-20 | +| 1.2.0 | 0.19 -> 0.20 | 2012-02-07 | +| 1.1.0 | 0.18 | 2011-12-13 | +| 1.0.0 | 0.18 | 2011-12-05 | ICU Normalization diff --git a/pom.xml b/pom.xml index 81e3f34aa94..2a8ae4faf91 100644 --- a/pom.xml +++ b/pom.xml @@ -31,8 +31,8 @@ - 0.90.6 - 4.5.1 + 0.90.8 + 4.6.0 diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java index e4475e853b9..987384c61c1 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java @@ -3,8 +3,8 @@ package org.elasticsearch.index.analysis; import com.ibm.icu.text.Collator; import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.util.ULocale; -import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.elasticsearch.common.inject.Injector; import org.elasticsearch.common.inject.ModulesBuilder; @@ -290,6 +290,10 @@ public class SimpleIcuCollationTokenFilterTests { .addAttribute(CharTermAttribute.class); CharTermAttribute term2 = stream2 .addAttribute(CharTermAttribute.class); + + stream1.reset(); + stream2.reset(); + 
assertThat(stream1.incrementToken(), equalTo(true)); assertThat(stream2.incrementToken(), equalTo(true)); assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison))); From c19d5e62f3ebc4deb5a0d5f13733659124db51b8 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 19 Dec 2013 09:56:19 +0100 Subject: [PATCH 057/131] Move tests to JUnit Closes #13. --- pom.xml | 42 ++++++++++--------- .../analysis/SimpleIcuAnalysisTests.java | 7 ++-- .../SimpleIcuCollationTokenFilterTests.java | 6 +-- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/pom.xml b/pom.xml index 2a8ae4faf91..c8b0d974455 100644 --- a/pom.xml +++ b/pom.xml @@ -33,6 +33,11 @@ 0.90.8 4.6.0 + 1 + true + onerror + + INFO @@ -43,6 +48,19 @@ + + org.hamcrest + hamcrest-all + 1.3 + test + + + org.apache.lucene + lucene-test-framework + ${lucene.version} + test + + org.elasticsearch elasticsearch @@ -65,26 +83,10 @@ - org.testng - testng - 6.8 - test - - - org.hamcrest - hamcrest-core - - - junit - junit - - - - - - org.hamcrest - hamcrest-all - 1.3 + org.elasticsearch + elasticsearch + ${elasticsearch.version} + test-jar test diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java index 43df9d270be..4fb6031bc92 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java @@ -29,15 +29,14 @@ import org.elasticsearch.index.IndexNameModule; import org.elasticsearch.index.settings.IndexSettingsModule; import org.elasticsearch.indices.analysis.IndicesAnalysisModule; import org.elasticsearch.indices.analysis.IndicesAnalysisService; -import org.testng.annotations.Test; +import org.elasticsearch.test.ElasticsearchTestCase; +import org.junit.Test; import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS; -import 
static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.instanceOf; - /** */ -public class SimpleIcuAnalysisTests { +public class SimpleIcuAnalysisTests extends ElasticsearchTestCase { @Test public void testDefaultsIcuAnalysis() { diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java index 987384c61c1..51fe755e9d6 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java @@ -18,16 +18,16 @@ import org.elasticsearch.index.IndexNameModule; import org.elasticsearch.index.settings.IndexSettingsModule; import org.elasticsearch.indices.analysis.IndicesAnalysisModule; import org.elasticsearch.indices.analysis.IndicesAnalysisService; -import org.testng.annotations.Test; +import org.elasticsearch.test.ElasticsearchTestCase; +import org.junit.Test; import java.io.IOException; import java.io.StringReader; -import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.equalTo; // Tests borrowed from Solr's Icu collation key filter factory test. -public class SimpleIcuCollationTokenFilterTests { +public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { /* * Turkish has some funny casing. From 914654b756276fae3654677b2b42627c28d11253 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 19 Dec 2013 10:00:26 +0100 Subject: [PATCH 058/131] prepare release elasticsearch-analysis-icu-1.13.0 --- README.md | 5 +++-- pom.xml | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d125ec0b437..6a83f15ecda 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,12 @@ ICU Analysis for ElasticSearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. 
-In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.12.0`. +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.13.0`. | ICU Analysis Plugin | elasticsearch | Release date | |--------------------------|------------------|:------------:| -| 1.13.0-SNAPSHOT (master) | 0.90.8 -> master | 2013-12-19 | +| 1.14.0-SNAPSHOT (master) | 0.90.8 -> master | | +| 1.13.0 | 0.90.8 -> master | 2013-12-19 | | 1.12.0 | 0.90.6 -> 0.90.7 | 2013-11-06 | | 1.11.0 | 0.90.3 -> 0.90.5 | 2013-08-08 | | 1.10.0 | 0.90.1 -> 0.90.2 | 2013-05-30 | diff --git a/pom.xml b/pom.xml index c8b0d974455..b30936e2728 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.13.0-SNAPSHOT + 1.13.0 jar ICU Analysis for ElasticSearch 2009 From 1dd683a03dc735fcc22764385d643c43cfff03c8 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 19 Dec 2013 10:04:19 +0100 Subject: [PATCH 059/131] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index b30936e2728..4c748de61a9 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.13.0 + 1.14.0-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 From 6d05c2bf53a81caf04c0b61175e22f60b5aafa4e Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 19 Dec 2013 10:50:28 +0100 Subject: [PATCH 060/131] Move tests to JUnit Related to #13. 
--- .gitignore | 1 + pom.xml | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 88 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 3916fcee60b..de7d76e25b7 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ /.settings /.classpath /.project +/.local-execution-hints.log diff --git a/pom.xml b/pom.xml index 4c748de61a9..4027ca2f957 100644 --- a/pom.xml +++ b/pom.xml @@ -104,15 +104,98 @@ + com.carrotsearch.randomizedtesting + junit4-maven-plugin + 2.0.12 + + + tests + test + + junit4 + + + 20 + pipe,warn + true + + + + + + + + + ${tests.jvms} + + + + + + + **/*Tests.class + **/*Test.class + + + **/Abstract*.class + **/*StressTest.class + + + -Xmx512m + -XX:MaxDirectMemorySize=512m + -Des.logger.prefix= + + ${tests.shuffle} + ${tests.verbose} + ${tests.seed} + ${tests.failfast} + + + ${tests.iters} + ${tests.maxfailures} + ${tests.failfast} + ${tests.class} + ${tests.method} + ${tests.nightly} + ${tests.badapples} + ${tests.weekly} + ${tests.slow} + ${tests.awaitsfix} + ${tests.slow} + ${tests.timeoutSuite} + ${tests.showSuccess} + ${tests.integration} + ${tests.cluster_seed} + ${tests.client.ratio} + ${env.ES_TEST_LOCAL} + ${es.node.mode} + ${es.logger.level} + true + + + + + + + org.apache.maven.plugins maven-surefire-plugin - 2.12.3 + 2.15 - - **/*Tests.java - + true + org.apache.maven.plugins maven-source-plugin From 114a120fe880f45e075493a99e6d03b5d69ab08b Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 10 Jan 2014 23:18:52 +0100 Subject: [PATCH 061/131] Update headers --- README.md | 8 ++++---- .../analysis/IcuAnalysisBinderProcessor.java | 14 +++++++------- .../IcuCollationTokenFilterFactory.java | 14 +++++++------- .../IcuFoldingTokenFilterFactory.java | 16 ++++++++-------- .../IcuNormalizerTokenFilterFactory.java | 16 ++++++++-------- .../index/analysis/IcuTokenizerFactory.java | 15 ++++++++------- .../IcuTransformTokenFilterFactory.java | 16 ++++++++-------- 
.../indices/analysis/IcuIndicesAnalysis.java | 19 +++++++++++++++++++ .../analysis/IcuIndicesAnalysisModule.java | 14 +++++++------- .../analysis/icu/AnalysisICUPlugin.java | 14 +++++++------- .../analysis/SimpleIcuAnalysisTests.java | 14 +++++++------- .../SimpleIcuCollationTokenFilterTests.java | 19 +++++++++++++++++++ 12 files changed, 109 insertions(+), 70 deletions(-) diff --git a/README.md b/README.md index 6a83f15ecda..7dafd962a7a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -ICU Analysis for ElasticSearch +ICU Analysis for Elasticsearch ================================== The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. @@ -7,8 +7,8 @@ In order to install the plugin, simply run: `bin/plugin -install elasticsearch/e | ICU Analysis Plugin | elasticsearch | Release date | |--------------------------|------------------|:------------:| -| 1.14.0-SNAPSHOT (master) | 0.90.8 -> master | | -| 1.13.0 | 0.90.8 -> master | 2013-12-19 | +| 1.14.0-SNAPSHOT (master) | 0.90.8 -> 0.90 | | +| 1.13.0 | 0.90.8 -> 0.90 | 2013-12-19 | | 1.12.0 | 0.90.6 -> 0.90.7 | 2013-11-06 | | 1.11.0 | 0.90.3 -> 0.90.5 | 2013-08-08 | | 1.10.0 | 0.90.1 -> 0.90.2 | 2013-05-30 | @@ -176,7 +176,7 @@ License This software is licensed under the Apache 2 license, quoted below. - Copyright 2009-2013 Shay Banon and ElasticSearch + Copyright 2009-2014 Elasticsearch Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java index c032c45ec2c..f23d32b5c74 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java @@ -1,11 +1,11 @@ /* - * Licensed to ElasticSearch and Shay Banon under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. ElasticSearch licenses this - * file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java index d756e978b64..d60f1f6d96c 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java @@ -1,11 +1,11 @@ /* - * Licensed to ElasticSearch and Shay Banon under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
ElasticSearch licenses this - * file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java index f543654dbc3..6badfd48c12 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java @@ -1,11 +1,11 @@ /* - * Licensed to Elastic Search and Shay Banon under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. Elastic Search licenses this - * file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * @@ -69,4 +69,4 @@ public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory { return new ICUFoldingFilter(tokenStream); } } -} \ No newline at end of file +} diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java index b28e7c92122..7f25886fc1d 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java @@ -1,11 +1,11 @@ /* - * Licensed to ElasticSearch and Shay Banon under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. ElasticSearch licenses this - * file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * @@ -49,4 +49,4 @@ public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory public TokenStream create(TokenStream tokenStream) { return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE)); } -} \ No newline at end of file +} diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java index 440fbb4084e..168e85f8d2a 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java @@ -1,11 +1,11 @@ /* - * Licensed to ElasticSearch and Shay Banon under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. ElasticSearch licenses this - * file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * @@ -16,6 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ + package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.Tokenizer; diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java index 17f727c446a..8ef48539daf 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java @@ -1,11 +1,11 @@ /* - * Licensed to ElasticSearch and Shay Banon under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. ElasticSearch licenses this - * file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * @@ -50,4 +50,4 @@ public class IcuTransformTokenFilterFactory extends AbstractTokenFilterFactory { public TokenStream create(TokenStream tokenStream) { return new ICUTransformFilter(tokenStream, transliterator); } -} \ No newline at end of file +} diff --git a/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java b/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java index 21ed87f854f..eb0edf76185 100644 --- a/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java +++ b/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java @@ -1,3 +1,22 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + package org.elasticsearch.indices.analysis; import com.ibm.icu.text.Collator; diff --git a/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysisModule.java b/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysisModule.java index 5547df665a0..e7587205a04 100644 --- a/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysisModule.java +++ b/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysisModule.java @@ -1,11 +1,11 @@ /* - * Licensed to ElasticSearch and Shay Banon under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. ElasticSearch licenses this - * file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * diff --git a/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java b/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java index 975bcf14b5d..a1cafe6508c 100644 --- a/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java +++ b/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java @@ -1,11 +1,11 @@ /* - * Licensed to ElasticSearch and Shay Banon under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
ElasticSearch licenses this - * file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java index 4fb6031bc92..99f46ebffbe 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java @@ -1,11 +1,11 @@ /* - * Licensed to ElasticSearch and Shay Banon under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. ElasticSearch licenses this - * file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java index 51fe755e9d6..39440e1e723 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java @@ -1,3 +1,22 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + package org.elasticsearch.index.analysis; import com.ibm.icu.text.Collator; From 51a96c1472a7ad0e8b1565696b2d6434344a06b7 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 10 Jan 2014 23:21:00 +0100 Subject: [PATCH 062/131] Preparing branch 1.x --- README.md | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 7dafd962a7a..e74ac2b165f 100644 --- a/README.md +++ b/README.md @@ -5,23 +5,24 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.13.0`. -| ICU Analysis Plugin | elasticsearch | Release date | -|--------------------------|------------------|:------------:| -| 1.14.0-SNAPSHOT (master) | 0.90.8 -> 0.90 | | -| 1.13.0 | 0.90.8 -> 0.90 | 2013-12-19 | -| 1.12.0 | 0.90.6 -> 0.90.7 | 2013-11-06 | -| 1.11.0 | 0.90.3 -> 0.90.5 | 2013-08-08 | -| 1.10.0 | 0.90.1 -> 0.90.2 | 2013-05-30 | -| 1.9.0 | 0.90.0 | 2013-04-29 | -| 1.8.0 | 0.90.0 | 2013-02-26 | -| 1.7.0 | 0.19 -> 0.20 | 2012-09-28 | -| 1.6.0 | 0.19 -> 0.20 | 2012-09-27 | -| 1.5.0 | 0.19 -> 0.20 | 2012-04-30 | -| 1.4.0 | 0.19 -> 0.20 | 2012-03-20 | -| 1.3.0 | 0.19 -> 0.20 | 2012-03-20 | -| 1.2.0 | 0.19 -> 0.20 | 2012-02-07 | -| 1.1.0 | 0.18 | 2011-12-13 | -| 1.0.0 | 0.18 | 2011-12-05 | +| ICU Analysis Plugin | elasticsearch | Release date | +|-----------------------------|---------------------|:------------:| +| 2.0.0.RC1-SNAPSHOT (master) | 1.0.0.RC1 -> master | | +| 1.14.0-SNAPSHOT (1.x) | 0.90.8 -> 0.90 | | +| 1.13.0 | 0.90.8 -> 0.90 | 2013-12-19 | +| 1.12.0 | 0.90.6 -> 0.90.7 | 2013-11-06 | +| 1.11.0 | 0.90.3 -> 0.90.5 | 2013-08-08 | +| 1.10.0 | 0.90.1 -> 0.90.2 | 2013-05-30 | +| 1.9.0 | 0.90.0 | 2013-04-29 | +| 1.8.0 | 0.90.0 | 2013-02-26 | +| 1.7.0 | 0.19 -> 0.20 | 2012-09-28 | +| 1.6.0 | 0.19 -> 0.20 | 2012-09-27 | +| 1.5.0 | 0.19 -> 0.20 | 2012-04-30 | +| 1.4.0 | 0.19 -> 0.20 | 
2012-03-20 | +| 1.3.0 | 0.19 -> 0.20 | 2012-03-20 | +| 1.2.0 | 0.19 -> 0.20 | 2012-02-07 | +| 1.1.0 | 0.18 | 2011-12-13 | +| 1.0.0 | 0.18 | 2011-12-05 | ICU Normalization From 4c935133e387e125e077f7fceefdb72a3b31ce0a Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 10 Jan 2014 23:23:07 +0100 Subject: [PATCH 063/131] Update to elasticsearch 1.0.0.RC1 Closes #16. --- pom.xml | 4 ++-- .../analysis/IcuCollationTokenFilterFactory.java | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pom.xml b/pom.xml index 4027ca2f957..bdcd940acbe 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 1.14.0-SNAPSHOT + 2.0.0.RC1-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 @@ -31,7 +31,7 @@ - 0.90.8 + 1.0.0.RC1-SNAPSHOT 4.6.0 1 true diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java index d60f1f6d96c..0e2a9799daf 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java @@ -24,7 +24,7 @@ import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.util.ULocale; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.collation.ICUCollationKeyFilter; -import org.elasticsearch.ElasticSearchIllegalArgumentException; +import org.elasticsearch.ElasticsearchIllegalArgumentException; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; @@ -63,15 +63,15 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { } catch (FailedToResolveConfigException e) { failureToResolve = e; } catch (IOException e) { - throw new ElasticSearchIllegalArgumentException("Failed to load collation rules", e); + throw new 
ElasticsearchIllegalArgumentException("Failed to load collation rules", e); } try { collator = new RuleBasedCollator(rules); } catch (Exception e) { if (failureToResolve != null) { - throw new ElasticSearchIllegalArgumentException("Failed to resolve collation rules location", failureToResolve); + throw new ElasticsearchIllegalArgumentException("Failed to resolve collation rules location", failureToResolve); } else { - throw new ElasticSearchIllegalArgumentException("Failed to parse collation rules", e); + throw new ElasticsearchIllegalArgumentException("Failed to parse collation rules", e); } } } else { @@ -109,7 +109,7 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { } else if (strength.equalsIgnoreCase("identical")) { collator.setStrength(Collator.IDENTICAL); } else { - throw new ElasticSearchIllegalArgumentException("Invalid strength: " + strength); + throw new ElasticsearchIllegalArgumentException("Invalid strength: " + strength); } } @@ -121,7 +121,7 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { } else if (decomposition.equalsIgnoreCase("canonical")) { collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION); } else { - throw new ElasticSearchIllegalArgumentException("Invalid decomposition: " + decomposition); + throw new ElasticsearchIllegalArgumentException("Invalid decomposition: " + decomposition); } } @@ -134,7 +134,7 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { } else if (alternate.equalsIgnoreCase("non-ignorable")) { rbc.setAlternateHandlingShifted(false); } else { - throw new ElasticSearchIllegalArgumentException("Invalid alternate: " + alternate); + throw new ElasticsearchIllegalArgumentException("Invalid alternate: " + alternate); } } @@ -150,7 +150,7 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { } else if (caseFirst.equalsIgnoreCase("upper")) { rbc.setUpperCaseFirst(true); } else { - throw new 
ElasticSearchIllegalArgumentException("Invalid caseFirst: " + caseFirst); + throw new ElasticsearchIllegalArgumentException("Invalid caseFirst: " + caseFirst); } } From f88f5e2fb7a5c04aff1536e96967cbba7c74b423 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 10 Jan 2014 23:30:05 +0100 Subject: [PATCH 064/131] Fix doc --- README.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e74ac2b165f..67f184db625 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ In order to install the plugin, simply run: `bin/plugin -install elasticsearch/e ICU Normalization ----------------- -Normalizes characters as explained "here":http://userguide.icu-project.org/transforms/normalization. It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings. Allows for the name parameter to be provided which can include the following values: `nfc`, `nfkc`, and `nfkc_cf`. Here is a sample settings: +Normalizes characters as explained [here](http://userguide.icu-project.org/transforms/normalization). It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings. Allows for the name parameter to be provided which can include the following values: `nfc`, `nfkc`, and `nfkc_cf`. Here is a sample settings: { "index" : { @@ -64,7 +64,9 @@ Folding of unicode characters based on `UTR#30`. It registers itself under `icu_ ICU Filtering ------------- -The folding can be filtered by a set of unicode characters with the parameter `unicodeSetFilter`. This is useful for a non-internationalized search engine where retaining a set of national characters which are primary letters in a specific language is wanted. See syntax for the UnicodeSet "here":http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html. +The folding can be filtered by a set of unicode characters with the parameter `unicodeSetFilter`. 
This is useful for a +non-internationalized search engine where retaining a set of national characters which are primary letters in a specific +language is wanted. See syntax for the UnicodeSet [here](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html). The Following example exempts Swedish characters from the folding. Note that the filtered characters are NOT lowercased which is why we add that filter below. @@ -90,7 +92,11 @@ The Following example exempts Swedish characters from the folding. Note that the ICU Collation ------------- -Uses collation token filter. Allows to either specify the rules for collation (defined "here":http://www.icu-project.org/userguide/Collate_Customization.html) using the `rules` parameter (can point to a location or expressed in the settings, location can be relative to config location), or using the `language` parameter (further specialized by country and variant). By default registers under `icu_collation` or `icuCollation` and uses the default locale. +Uses collation token filter. Allows to either specify the rules for collation +(defined [here](http://www.icu-project.org/userguide/Collate_Customization.html)) using the `rules` parameter +(can point to a location or expressed in the settings, location can be relative to config location), or using the +`language` parameter (further specialized by country and variant). By default registers under `icu_collation` or +`icuCollation` and uses the default locale. Here is a sample settings: @@ -132,7 +138,7 @@ Optional options: * `strength` - The strength property determines the minimum level of difference considered significant during comparison. The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator. Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`. 
- See ICU Collation:http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html documentation for a more detailed + See [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) documentation for a more detailed explanation for the specific values. * `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property with `canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were @@ -157,7 +163,7 @@ Expert options: ICU Tokenizer ------------- -Breaks text into words according to UAX #29: Unicode Text Segmentation ((http://www.unicode.org/reports/tr29/)). +Breaks text into words according to [UAX #29: Unicode Text Segmentation](http://www.unicode.org/reports/tr29/). { "index" : { From 8561f897a984807958a543e378ab01349935a739 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 15 Jan 2014 18:22:36 +0100 Subject: [PATCH 065/131] prepare release elasticsearch-analysis-icu-2.0.0.RC1 --- README.md | 5 +++-- pom.xml | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 67f184db625..dbdcdcd681f 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,12 @@ ICU Analysis for Elasticsearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.13.0`. +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.0.0.RC1`. 
| ICU Analysis Plugin | elasticsearch | Release date | |-----------------------------|---------------------|:------------:| -| 2.0.0.RC1-SNAPSHOT (master) | 1.0.0.RC1 -> master | | +| 2.0.0-SNAPSHOT (master) | 1.0.0.RC1 -> master | | +| 2.0.0.RC1 | 1.0.0.RC1 -> master | 2014-01-15 | | 1.14.0-SNAPSHOT (1.x) | 0.90.8 -> 0.90 | | | 1.13.0 | 0.90.8 -> 0.90 | 2013-12-19 | | 1.12.0 | 0.90.6 -> 0.90.7 | 2013-11-06 | diff --git a/pom.xml b/pom.xml index bdcd940acbe..19bf21b07ce 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 2.0.0.RC1-SNAPSHOT + 2.0.0.RC1 jar ICU Analysis for ElasticSearch 2009 @@ -31,7 +31,7 @@ - 1.0.0.RC1-SNAPSHOT + 1.0.0.RC1 4.6.0 1 true From 5e102f53eec60041d35f81e3329e24b4ba734383 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 15 Jan 2014 19:52:23 +0100 Subject: [PATCH 066/131] prepare for next development iteration --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 19bf21b07ce..d40c381fb7b 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 2.0.0.RC1 + 2.0.0-SNAPSHOT jar ICU Analysis for ElasticSearch 2009 From c962f0b8afd10a56e87d413ab50190b7ba6db2e0 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 28 Feb 2014 23:09:56 +0100 Subject: [PATCH 067/131] Add plugin release semi-automatic script Closes #21 --- README.md | 20 +- dev-tools/build_release.py | 708 +++++++++++++++++++++++++++++++++++++ dev-tools/upload-s3.py | 67 ++++ pom.xml | 5 +- 4 files changed, 782 insertions(+), 18 deletions(-) create mode 100755 dev-tools/build_release.py create mode 100644 dev-tools/upload-s3.py diff --git a/README.md b/README.md index dbdcdcd681f..2a898b5fd5a 100644 --- a/README.md +++ b/README.md @@ -5,25 +5,13 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, simply run: `bin/plugin -install 
elasticsearch/elasticsearch-analysis-icu/2.0.0.RC1`. +* For 1.0.x elasticsearch versions, look at [master branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/master). +* For 0.90.x elasticsearch versions, look at [1.x branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/1.x). + | ICU Analysis Plugin | elasticsearch | Release date | |-----------------------------|---------------------|:------------:| -| 2.0.0-SNAPSHOT (master) | 1.0.0.RC1 -> master | | +| 2.0.0-SNAPSHOT | 1.0.0.RC1 -> master | XXXX-XX-XX | | 2.0.0.RC1 | 1.0.0.RC1 -> master | 2014-01-15 | -| 1.14.0-SNAPSHOT (1.x) | 0.90.8 -> 0.90 | | -| 1.13.0 | 0.90.8 -> 0.90 | 2013-12-19 | -| 1.12.0 | 0.90.6 -> 0.90.7 | 2013-11-06 | -| 1.11.0 | 0.90.3 -> 0.90.5 | 2013-08-08 | -| 1.10.0 | 0.90.1 -> 0.90.2 | 2013-05-30 | -| 1.9.0 | 0.90.0 | 2013-04-29 | -| 1.8.0 | 0.90.0 | 2013-02-26 | -| 1.7.0 | 0.19 -> 0.20 | 2012-09-28 | -| 1.6.0 | 0.19 -> 0.20 | 2012-09-27 | -| 1.5.0 | 0.19 -> 0.20 | 2012-04-30 | -| 1.4.0 | 0.19 -> 0.20 | 2012-03-20 | -| 1.3.0 | 0.19 -> 0.20 | 2012-03-20 | -| 1.2.0 | 0.19 -> 0.20 | 2012-02-07 | -| 1.1.0 | 0.18 | 2011-12-13 | -| 1.0.0 | 0.18 | 2011-12-05 | ICU Normalization diff --git a/dev-tools/build_release.py b/dev-tools/build_release.py new file mode 100755 index 00000000000..9166b09e7e3 --- /dev/null +++ b/dev-tools/build_release.py @@ -0,0 +1,708 @@ +# Licensed to Elasticsearch under one or more contributor +# license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright +# ownership. Elasticsearch licenses this file to you under +# the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on +# an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +# either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +import re +import tempfile +import shutil +import os +import datetime +import argparse +import github3 +import smtplib + +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText + +from os.path import dirname, abspath + +""" + This tool builds a release from the a given elasticsearch plugin branch. + In order to execute it go in the top level directory and run: + $ python3 dev_tools/build_release.py --branch master --publish --remote origin + + By default this script runs in 'dry' mode which essentially simulates a release. If the + '--publish' option is set the actual release is done. + If not in 'dry' mode, a mail will be automatically sent to the mailing list. + You can disable it with the option '--disable_mail' + + $ python3 dev_tools/build_release.py --publish --remote origin --disable_mail + + The script takes over almost all + steps necessary for a release from a high level point of view it does the following things: + + - run prerequisite checks ie. 
check for Java 1.6 being present or S3 credentials available as env variables + - detect the version to release from the specified branch (--branch) or the current branch + - creates a release branch & updates pom.xml and README.md to point to a release version rather than a snapshot + - builds the artifacts + - commits the new version and merges the release branch into the source branch + - creates a tag and pushes the commit to the specified origin (--remote) + - publishes the releases to sonatype and S3 + - send a mail based on github issues fixed by this version + +Once it's done it will print all the remaining steps. + + Prerequisites: + - Python 3k for script execution + - Boto for S3 Upload ($ apt-get install python-boto or pip-3.3 install boto) + - github3 module (pip-3.3 install github3.py) + - S3 keys exported via ENV Variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) + - GITHUB (login/password) or key exported via ENV Variables (GITHUB_LOGIN, GITHUB_PASSWORD or GITHUB_KEY) + (see https://github.com/settings/applications#personal-access-tokens) - Optional: default to no authentication + - SMTP_HOST - Optional: default to localhost + - MAIL_SENDER - Optional: default to 'david@pilato.fr': must be authorized to send emails to elasticsearch mailing list + - MAIL_TO - Optional: default to 'elasticsearch@googlegroups.com' +""" +env = os.environ + +LOG = env.get('ES_RELEASE_LOG', '/tmp/elasticsearch_release.log') +ROOT_DIR = os.path.join(abspath(dirname(__file__)), '../') +README_FILE = ROOT_DIR + 'README.md' +POM_FILE = ROOT_DIR + 'pom.xml' + +def log(msg): + log_plain('\n%s' % msg) + +def log_plain(msg): + f = open(LOG, mode='ab') + f.write(msg.encode('utf-8')) + f.close() + +def run(command, quiet=False): + log('%s: RUN: %s\n' % (datetime.datetime.now(), command)) + if os.system('%s >> %s 2>&1' % (command, LOG)): + msg = ' FAILED: %s [see log %s]' % (command, LOG) + if not quiet: + print(msg) + raise RuntimeError(msg) + +try: + JAVA_HOME = env['JAVA_HOME'] 
+except KeyError: + raise RuntimeError(""" + Please set JAVA_HOME in the env before running release tool + On OSX use: export JAVA_HOME=`/usr/libexec/java_home -v '1.6*'`""") + +try: + MVN='mvn' + # make sure mvn3 is used if mvn3 is available + # some systems use maven 2 as default + run('mvn3 --version', quiet=True) + MVN='mvn3' +except RuntimeError: + pass + + +def java_exe(): + path = JAVA_HOME + return 'export JAVA_HOME="%s" PATH="%s/bin:$PATH" JAVACMD="%s/bin/java"' % (path, path, path) + +def verify_java_version(version): + s = os.popen('%s; java -version 2>&1' % java_exe()).read() + if s.find(' version "%s.' % version) == -1: + raise RuntimeError('got wrong version for java %s:\n%s' % (version, s)) + +# Verifies the java version. We guarantee that we run with Java 1.6 +# If 1.6 is not available fail the build! +def verify_mvn_java_version(version, mvn): + s = os.popen('%s; %s --version 2>&1' % (java_exe(), mvn)).read() + if s.find('Java version: %s' % version) == -1: + raise RuntimeError('got wrong java version for %s %s:\n%s' % (mvn, version, s)) + +# Returns the hash of the current git HEAD revision +def get_head_hash(): + return os.popen(' git rev-parse --verify HEAD 2>&1').read().strip() + +# Returns the hash of the given tag revision +def get_tag_hash(tag): + return os.popen('git show-ref --tags %s --hash 2>&1' % (tag)).read().strip() + +# Returns the name of the current branch +def get_current_branch(): + return os.popen('git rev-parse --abbrev-ref HEAD 2>&1').read().strip() + +verify_java_version('1.6') # we require to build with 1.6 +verify_mvn_java_version('1.6', MVN) + +# Utility that returns the name of the release branch for a given version +def release_branch(version): + return 'release_branch_%s' % version + +# runs get fetch on the given remote +def fetch(remote): + run('git fetch %s' % remote) + +# Creates a new release branch from the given source branch +# and rebases the source branch from the remote before creating +# the release branch. 
Note: This fails if the source branch +# doesn't exist on the provided remote. +def create_release_branch(remote, src_branch, release): + run('git checkout %s' % src_branch) + run('git pull --rebase %s %s' % (remote, src_branch)) + run('git checkout -b %s' % (release_branch(release))) + + +# Reads the given file and applies the +# callback to it. If the callback changed +# a line the given file is replaced with +# the modified input. +def process_file(file_path, line_callback): + fh, abs_path = tempfile.mkstemp() + modified = False + with open(abs_path,'w', encoding='utf-8') as new_file: + with open(file_path, encoding='utf-8') as old_file: + for line in old_file: + new_line = line_callback(line) + modified = modified or (new_line != line) + new_file.write(new_line) + os.close(fh) + if modified: + #Remove original file + os.remove(file_path) + #Move new file + shutil.move(abs_path, file_path) + return True + else: + # nothing to do - just remove the tmp file + os.remove(abs_path) + return False + +# Guess the next snapshot version number (increment second digit) +def guess_snapshot(version): + digits=list(map(int, re.findall(r'\d+', version))) + source='%s.%s' % (digits[0], digits[1]) + destination='%s.%s' % (digits[0], digits[1]+1) + return version.replace(source, destination) + +# Moves the pom.xml file from a snapshot to a release +def remove_maven_snapshot(pom, release): + pattern = '%s-SNAPSHOT' % release + replacement = '%s' % release + def callback(line): + return line.replace(pattern, replacement) + process_file(pom, callback) + +# Moves the README.md file from a snapshot to a release +def remove_version_snapshot(readme_file, release): + pattern = '%s-SNAPSHOT' % release + replacement = '%s ' % release + def callback(line): + return line.replace(pattern, replacement) + process_file(readme_file, callback) + +# Moves the pom.xml file to the next snapshot +def add_maven_snapshot(pom, release, snapshot): + pattern = '%s' % release + replacement = '%s-SNAPSHOT' 
% snapshot + def callback(line): + return line.replace(pattern, replacement) + process_file(pom, callback) + +# Add in README.md file the next snapshot +def add_version_snapshot(readme_file, release, snapshot): + pattern = '| %s ' % release + replacement = '| %s-SNAPSHOT' % snapshot + def callback(line): + # If we find pattern, we copy the line and replace its content + if line.find(pattern) >= 0: + return line.replace(pattern, replacement).replace('%s' % (datetime.datetime.now().strftime("%Y-%m-%d")), + 'XXXX-XX-XX')+line + else: + return line + process_file(readme_file, callback) + + +# Set release date in README.md file +def set_date(readme_file): + pattern = 'XXXX-XX-XX' + replacement = '%s' % (datetime.datetime.now().strftime("%Y-%m-%d")) + def callback(line): + return line.replace(pattern, replacement) + process_file(readme_file, callback) + +# Update installation instructions in README.md file +def set_install_instructions(readme_file, artifact_name, release): + pattern = '`bin/plugin -install elasticsearch/%s/.+`' % artifact_name + replacement = '`bin/plugin -install elasticsearch/%s/%s`' % (artifact_name, release) + def callback(line): + return re.sub(pattern, replacement, line) + process_file(readme_file, callback) + + +# Stages the given files for the next git commit +def add_pending_files(*files): + for file in files: + run('git add %s' % file) + +# Executes a git commit with 'release [version]' as the commit message +def commit_release(artifact_id, release): + run('git commit -m "prepare release %s-%s"' % (artifact_id, release)) + +def commit_snapshot(): + run('git commit -m "prepare for next development iteration"') + +def tag_release(release): + run('git tag -a v%s -m "Tag release version %s"' % (release, release)) + +def run_mvn(*cmd): + for c in cmd: + run('%s; %s -f %s %s' % (java_exe(), MVN, POM_FILE, c)) + +def build_release(run_tests=False, dry_run=True): + target = 'deploy' + if dry_run: + target = 'package' + if run_tests: + run_mvn('clean 
test') + run_mvn('clean %s -DskipTests' %(target)) + +# Checks the pom.xml for the release version. 2.0.0-SNAPSHOT +# This method fails if the pom file has no SNAPSHOT version set ie. +# if the version is already on a release version we fail. +# Returns the next version string ie. 0.90.7 +def find_release_version(src_branch): + run('git checkout %s' % src_branch) + with open(POM_FILE, encoding='utf-8') as file: + for line in file: + match = re.search(r'(.+)-SNAPSHOT', line) + if match: + return match.group(1) + raise RuntimeError('Could not find release version in branch %s' % src_branch) + +# extract a value from pom.xml +def find_from_pom(tag): + with open(POM_FILE, encoding='utf-8') as file: + for line in file: + match = re.search(r'<%s>(.+)' % (tag, tag), line) + if match: + return match.group(1) + raise RuntimeError('Could not find <%s> in pom.xml file' % (tag)) + +def get_artifacts(artifact_id, release): + artifact_path = ROOT_DIR + 'target/releases/%s-%s.zip' % (artifact_id, release) + print(' Path %s' % (artifact_path)) + if not os.path.isfile(artifact_path): + raise RuntimeError('Could not find required artifact at %s' % (artifact_path)) + return artifact_path + +# Generates sha1 for a file +# and returns the checksum files as well +# as the given files in a list +def generate_checksums(release_file): + res = [] + directory = os.path.dirname(release_file) + file = os.path.basename(release_file) + checksum_file = '%s.sha1.txt' % file + + if os.system('cd %s; shasum %s > %s' % (directory, file, checksum_file)): + raise RuntimeError('Failed to generate checksum for file %s' % release_file) + res = res + [os.path.join(directory, checksum_file), release_file] + return res + +def git_merge(src_branch, release_version): + run('git checkout %s' % src_branch) + run('git merge %s' % release_branch(release_version)) + +def git_push(remote, src_branch, release_version, dry_run): + if not dry_run: + run('git push %s %s' % (remote, src_branch)) # push the commit + 
run('git push %s v%s' % (remote, release_version)) # push the tag + else: + print(' dryrun [True] -- skipping push to remote %s' % remote) + +def publish_artifacts(artifacts, base='elasticsearch/elasticsearch', dry_run=True): + location = os.path.dirname(os.path.realpath(__file__)) + for artifact in artifacts: + if dry_run: + print('Skip Uploading %s to Amazon S3 in %s' % (artifact, base)) + else: + print('Uploading %s to Amazon S3' % artifact) + # requires boto to be installed but it is not available on python3k yet so we use a dedicated tool + run('python %s/upload-s3.py --file %s --path %s' % (location, os.path.abspath(artifact), base)) + + +################# +## +## +## Email and Github Management +## +## +################# +def format_issues_plain(issues, title='Fix'): + response = "" + + if len(issues) > 0: + response += '%s:\n' % title + for issue in issues: + response += ' * [%s] - %s (%s)\n' % (issue.number, issue.title, issue.html_url) + + return response + +def format_issues_html(issues, title='Fix'): + response = "" + + if len(issues) > 0: + response += '

%s

\n\n' + + return response + +def get_github_repository(reponame, + login=env.get('GITHUB_LOGIN', None), + password=env.get('GITHUB_PASSWORD', None), + key=env.get('GITHUB_KEY', None)): + if login: + g = github3.login(login, password) + elif key: + g = github3.login(token=key) + else: + g = github3.GitHub() + + return g.repository("elasticsearch", reponame) + +# Check if there are some remaining open issues and fails +def check_opened_issues(version, repository, reponame): + opened_issues = [i for i in repository.iter_issues(state='open', labels='%s' % version)] + if len(opened_issues)>0: + raise NameError('Some issues [%s] are still opened. Check https://github.com/elasticsearch/%s/issues?labels=%s&state=open' + % (len(opened_issues), reponame, version)) + +# List issues from github: can be done anonymously if you don't +# exceed a given number of github API calls per day +# Check if there are some remaining open issues and fails +def list_issues(version, + repository, + severity='bug'): + issues = [i for i in repository.iter_issues(state='closed', labels='%s,%s' % (severity, version))] + return issues + +# Get issues from github and generates a Plain/HTML Multipart email +# And send it if dry_run=False +def prepare_email(artifact_id, release_version, repository, + artifact_name, artifact_description, project_url, + severity_labels_bug='bug', + severity_labels_update='update', + severity_labels_new='new', + severity_labels_doc='doc'): + + ## Get bugs from github + issues_bug = list_issues(release_version, repository, severity=severity_labels_bug) + issues_update = list_issues(release_version, repository, severity=severity_labels_update) + issues_new = list_issues(release_version, repository, severity=severity_labels_new) + issues_doc = list_issues(release_version, repository, severity=severity_labels_doc) + + ## Format content to plain text + plain_issues_bug = format_issues_plain(issues_bug, 'Fix') + plain_issues_update = format_issues_plain(issues_update, 
'Update') + plain_issues_new = format_issues_plain(issues_new, 'New') + plain_issues_doc = format_issues_plain(issues_doc, 'Doc') + + ## Format content to html + html_issues_bug = format_issues_html(issues_bug, 'Fix') + html_issues_update = format_issues_html(issues_update, 'Update') + html_issues_new = format_issues_html(issues_new, 'New') + html_issues_doc = format_issues_html(issues_doc, 'Doc') + + if len(issues_bug)+len(issues_update)+len(issues_new)+len(issues_doc) > 0: + plain_empty_message = "" + html_empty_message = "" + + else: + plain_empty_message = "No issue listed for this release" + html_empty_message = "

No issue listed for this release

" + + msg = MIMEMultipart('alternative') + msg['Subject'] = '[ANN] %s %s released' % (artifact_name, release_version) + text = """ +Heya, + + +We are pleased to announce the release of the %(artifact_name)s, version %(release_version)s. + +%(artifact_description)s. + +%(project_url)s + +Release Notes - %(artifact_id)s - Version %(release_version)s + +%(empty_message)s +%(issues_bug)s +%(issues_update)s +%(issues_new)s +%(issues_doc)s + +Issues, Pull requests, Feature requests are warmly welcome on %(artifact_id)s project repository: %(project_url)s +For questions or comments around this plugin, feel free to use elasticsearch mailing list: https://groups.google.com/forum/#!forum/elasticsearch + +Enjoy, + +-The Elasticsearch team +""" % {'release_version': release_version, + 'artifact_id': artifact_id, + 'artifact_name': artifact_name, + 'artifact_description': artifact_description, + 'project_url': project_url, + 'empty_message': plain_empty_message, + 'issues_bug': plain_issues_bug, + 'issues_update': plain_issues_update, + 'issues_new': plain_issues_new, + 'issues_doc': plain_issues_doc} + + html = """ + + +

Heya,

+ +

We are pleased to announce the release of the %(artifact_name)s, version %(release_version)s

+ +
%(artifact_description)s.
+ +

Release Notes - Version %(release_version)s

+%(empty_message)s +%(issues_bug)s +%(issues_update)s +%(issues_new)s +%(issues_doc)s + +

Issues, Pull requests, Feature requests are warmly welcome on +%(artifact_id)s project repository!

+

For questions or comments around this plugin, feel free to use elasticsearch +mailing list!

+ +

Enjoy,

+ +

- The Elasticsearch team

+ +""" % {'release_version': release_version, + 'artifact_id': artifact_id, + 'artifact_name': artifact_name, + 'artifact_description': artifact_description, + 'project_url': project_url, + 'empty_message': html_empty_message, + 'issues_bug': html_issues_bug, + 'issues_update': html_issues_update, + 'issues_new': html_issues_new, + 'issues_doc': html_issues_doc} + + # Record the MIME types of both parts - text/plain and text/html. + part1 = MIMEText(text, 'plain') + part2 = MIMEText(html, 'html') + + # Attach parts into message container. + # According to RFC 2046, the last part of a multipart message, in this case + # the HTML message, is best and preferred. + msg.attach(part1) + msg.attach(part2) + + return msg + +def send_email(msg, + dry_run=True, + mail=True, + sender=env.get('MAIL_SENDER'), + to=env.get('MAIL_TO', 'elasticsearch@googlegroups.com'), + smtp_server=env.get('SMTP_SERVER', 'localhost')): + msg['From'] = 'Elasticsearch Team <%s>' % sender + msg['To'] = 'Elasticsearch Mailing List <%s>' % to + # save mail on disk + with open(ROOT_DIR+'target/email.txt', 'w') as email_file: + email_file.write(msg.as_string()) + if mail and not dry_run: + s = smtplib.SMTP(smtp_server, 25) + s.sendmail(sender, to, msg.as_string()) + s.quit() + else: + print('generated email: open %starget/email.txt' % ROOT_DIR) + +def print_sonatype_notice(): + settings = os.path.join(os.path.expanduser('~'), '.m2/settings.xml') + if os.path.isfile(settings): + with open(settings, encoding='utf-8') as settings_file: + for line in settings_file: + if line.strip() == 'sonatype-nexus-snapshots': + # moving out - we found the indicator no need to print the warning + return + print(""" + NOTE: No sonatype settings detected, make sure you have configured + your sonatype credentials in '~/.m2/settings.xml': + + + ... + + + sonatype-nexus-snapshots + your-jira-id + your-jira-pwd + + + sonatype-nexus-staging + your-jira-id + your-jira-pwd + + + ... 
+ + """) + +def check_s3_credentials(): + if not env.get('AWS_ACCESS_KEY_ID', None) or not env.get('AWS_SECRET_ACCESS_KEY', None): + raise RuntimeError('Could not find "AWS_ACCESS_KEY_ID" / "AWS_SECRET_ACCESS_KEY" in the env variables please export in order to upload to S3') + +def check_github_credentials(): + if not env.get('GITHUB_KEY', None) and not env.get('GITHUB_LOGIN', None): + log('WARN: Could not find "GITHUB_LOGIN" / "GITHUB_PASSWORD" or "GITHUB_KEY" in the env variables. You could need it.') + +def check_email_settings(): + if not env.get('MAIL_SENDER', None): + raise RuntimeError('Could not find "MAIL_SENDER"') + +# we print a notice if we can not find the relevant infos in the ~/.m2/settings.xml +print_sonatype_notice() + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Builds and publishes a Elasticsearch Plugin Release') + parser.add_argument('--branch', '-b', metavar='master', default=get_current_branch(), + help='The branch to release from. Defaults to the current branch.') + parser.add_argument('--skiptests', '-t', dest='tests', action='store_false', + help='Skips tests before release. Tests are run by default.') + parser.set_defaults(tests=True) + parser.add_argument('--remote', '-r', metavar='origin', default='origin', + help='The remote to push the release commit and tag to. Default is [origin]') + parser.add_argument('--publish', '-p', dest='dryrun', action='store_false', + help='Publishes the release. Disable by default.') + parser.add_argument('--disable_mail', '-dm', dest='mail', action='store_false', + help='Do not send a release email. 
Email is sent by default.') + + parser.set_defaults(dryrun=True) + parser.set_defaults(mail=True) + args = parser.parse_args() + + src_branch = args.branch + remote = args.remote + run_tests = args.tests + dry_run = args.dryrun + mail = args.mail + + if not dry_run: + check_s3_credentials() + print('WARNING: dryrun is set to "false" - this will push and publish the release') + if mail: + check_email_settings() + print('An email to %s will be sent after the release' + % env.get('MAIL_TO', 'elasticsearch@googlegroups.com')) + input('Press Enter to continue...') + + check_github_credentials() + + print(''.join(['-' for _ in range(80)])) + print('Preparing Release from branch [%s] running tests: [%s] dryrun: [%s]' % (src_branch, run_tests, dry_run)) + print(' JAVA_HOME is [%s]' % JAVA_HOME) + print(' Running with maven command: [%s] ' % (MVN)) + + release_version = find_release_version(src_branch) + artifact_id = find_from_pom('artifactId') + artifact_name = find_from_pom('name') + artifact_description = find_from_pom('description') + project_url = find_from_pom('url') + print(' Artifact Id: [%s]' % artifact_id) + print(' Release version: [%s]' % release_version) + + # extract snapshot + default_snapshot_version = guess_snapshot(release_version) + snapshot_version = input('Enter next snapshot version [%s]:' % default_snapshot_version) + snapshot_version = snapshot_version or default_snapshot_version + + print(' Next version: [%s-SNAPSHOT]' % snapshot_version) + print(' Artifact Name: [%s]' % artifact_name) + print(' Artifact Description: [%s]' % artifact_description) + print(' Project URL: [%s]' % project_url) + + if not dry_run: + smoke_test_version = release_version + head_hash = get_head_hash() + run_mvn('clean') # clean the env! 
+ create_release_branch(remote, src_branch, release_version) + print(' Created release branch [%s]' % (release_branch(release_version))) + success = False + try: + pending_files = [POM_FILE, README_FILE] + remove_maven_snapshot(POM_FILE, release_version) + remove_version_snapshot(README_FILE, release_version) + set_date(README_FILE) + set_install_instructions(README_FILE, artifact_id, release_version) + print(' Done removing snapshot version') + add_pending_files(*pending_files) # expects var args use * to expand + commit_release(artifact_id, release_version) + print(' Committed release version [%s]' % release_version) + print(''.join(['-' for _ in range(80)])) + print('Building Release candidate') + input('Press Enter to continue...') + print(' Checking github issues') + repository = get_github_repository(artifact_id) + check_opened_issues(release_version, repository, artifact_id) + if not dry_run: + print(' Running maven builds now and publish to sonatype - run-tests [%s]' % run_tests) + else: + print(' Running maven builds now run-tests [%s]' % run_tests) + build_release(run_tests=run_tests, dry_run=dry_run) + artifact = get_artifacts(artifact_id, release_version) + artifact_and_checksums = generate_checksums(artifact) + print(''.join(['-' for _ in range(80)])) + + print('Finish Release -- dry_run: %s' % dry_run) + input('Press Enter to continue...') + print(' merge release branch') + git_merge(src_branch, release_version) + print(' tag') + tag_release(release_version) + + add_maven_snapshot(POM_FILE, release_version, snapshot_version) + add_version_snapshot(README_FILE, release_version, snapshot_version) + add_pending_files(*pending_files) + commit_snapshot() + + print(' push to %s %s -- dry_run: %s' % (remote, src_branch, dry_run)) + git_push(remote, src_branch, release_version, dry_run) + print(' publish artifacts to S3 -- dry_run: %s' % dry_run) + publish_artifacts(artifact_and_checksums, base='elasticsearch/%s' % (artifact_id) , dry_run=dry_run) + print(' 
preparing email (from github issues)') + msg = prepare_email(artifact_id, release_version, repository, artifact_name, artifact_description, project_url) + print(' sending email -- dry_run: %s, mail: %s' % (dry_run, mail)) + send_email(msg, dry_run=dry_run, mail=mail) + + pending_msg = """ +Release successful pending steps: + * close and release sonatype repo: https://oss.sonatype.org/ + * check if the release is there https://oss.sonatype.org/content/repositories/releases/org/elasticsearch/%(artifact_id)s/%(version)s + * tweet about the release +""" + print(pending_msg % {'version': release_version, + 'artifact_id': artifact_id, + 'project_url': project_url}) + success = True + finally: + if not success: + run('git reset --hard HEAD') + run('git checkout %s' % src_branch) + elif dry_run: + print('End of dry_run') + input('Press Enter to reset changes...') + + run('git reset --hard %s' % head_hash) + run('git tag -d v%s' % release_version) + # we delete this one anyways + run('git branch -D %s' % (release_branch(release_version))) diff --git a/dev-tools/upload-s3.py b/dev-tools/upload-s3.py new file mode 100644 index 00000000000..95ea576e65c --- /dev/null +++ b/dev-tools/upload-s3.py @@ -0,0 +1,67 @@ +# Licensed to Elasticsearch under one or more contributor +# license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright +# ownership. Elasticsearch licenses this file to you under +# the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on +# an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +# either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+ +import os +import sys +import argparse +try: + import boto.s3 +except: + raise RuntimeError(""" + S3 upload requires boto to be installed + Use one of: + 'pip install -U boto' + 'apt-get install python-boto' + 'easy_install boto' + """) + +import boto.s3 + + +def list_buckets(conn): + return conn.get_all_buckets() + + +def upload_s3(conn, path, key, file, bucket): + print 'Uploading %s to Amazon S3 bucket %s/%s' % \ + (file, bucket, os.path.join(path, key)) + def percent_cb(complete, total): + sys.stdout.write('.') + sys.stdout.flush() + bucket = conn.create_bucket(bucket) + k = bucket.new_key(os.path.join(path, key)) + k.set_contents_from_filename(file, cb=percent_cb, num_cb=100) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Uploads files to Amazon S3') + parser.add_argument('--file', '-f', metavar='path to file', + help='the branch to release from', required=True) + parser.add_argument('--bucket', '-b', metavar='B42', default='download.elasticsearch.org', + help='The S3 Bucket to upload to') + parser.add_argument('--path', '-p', metavar='elasticsearch/elasticsearch', default='elasticsearch/elasticsearch', + help='The key path to use') + parser.add_argument('--key', '-k', metavar='key', default=None, + help='The key - uses the file name as default key') + args = parser.parse_args() + if args.key: + key = args.key + else: + key = os.path.basename(args.file) + + connection = boto.connect_s3() + upload_s3(connection, args.path, key, args.file, args.bucket); + diff --git a/pom.xml b/pom.xml index d40c381fb7b..0d3798de238 100644 --- a/pom.xml +++ b/pom.xml @@ -2,13 +2,14 @@ - elasticsearch-analysis-icu 4.0.0 org.elasticsearch elasticsearch-analysis-icu 2.0.0-SNAPSHOT jar - ICU Analysis for ElasticSearch + ICU Analysis plugin for elasticsearch + The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. 
+ https://github.com/elasticsearch/elasticsearch-analysis-icu/ 2009 From 5a8f3b75a80cea34cb3d50a65defbf66ee888d82 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 28 Feb 2014 23:16:00 +0100 Subject: [PATCH 068/131] Update to Lucene 4.6.1 / Elasticsearch 1.0.0 Closes #18 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 0d3798de238..a5f444bbdf9 100644 --- a/pom.xml +++ b/pom.xml @@ -32,8 +32,8 @@ - 1.0.0.RC1 - 4.6.0 + 1.0.0 + 4.6.1 1 true onerror From 34a1fab4b9399bcaec4eb89865dfaba136efb0c2 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 28 Feb 2014 23:17:47 +0100 Subject: [PATCH 069/131] prepare release elasticsearch-analysis-icu-2.0.0 --- README.md | 4 ++-- pom.xml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2a898b5fd5a..8ef68cd6250 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,14 @@ ICU Analysis for Elasticsearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.0.0.RC1`. +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.0.0`. * For 1.0.x elasticsearch versions, look at [master branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/master). * For 0.90.x elasticsearch versions, look at [1.x branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/1.x). 
| ICU Analysis Plugin | elasticsearch | Release date | |-----------------------------|---------------------|:------------:| -| 2.0.0-SNAPSHOT | 1.0.0.RC1 -> master | XXXX-XX-XX | +| 2.0.0 | 1.0.0.RC1 -> master | 2014-02-28 | | 2.0.0.RC1 | 1.0.0.RC1 -> master | 2014-01-15 | diff --git a/pom.xml b/pom.xml index a5f444bbdf9..abcd226e640 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 2.0.0-SNAPSHOT + 2.0.0 jar ICU Analysis plugin for elasticsearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. From e3f72c61efdc30a710199730b6010fe54843c04b Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 28 Feb 2014 23:20:30 +0100 Subject: [PATCH 070/131] prepare for next development iteration --- README.md | 1 + pom.xml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8ef68cd6250..fa5ec91548f 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ In order to install the plugin, simply run: `bin/plugin -install elasticsearch/e | ICU Analysis Plugin | elasticsearch | Release date | |-----------------------------|---------------------|:------------:| +| 2.1.0-SNAPSHOT | 1.0.0.RC1 -> master | XXXX-XX-XX | | 2.0.0 | 1.0.0.RC1 -> master | 2014-02-28 | | 2.0.0.RC1 | 1.0.0.RC1 -> master | 2014-01-15 | diff --git a/pom.xml b/pom.xml index abcd226e640..a35c7c859bc 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 2.0.0 + 2.1.0-SNAPSHOT jar ICU Analysis plugin for elasticsearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. From 2de0ca1f1abb851f4becb29fc76884f525db42be Mon Sep 17 00:00:00 2001 From: David Pilato Date: Mon, 3 Mar 2014 10:41:25 +0100 Subject: [PATCH 071/131] Update naming for better mailing list announcement By now, when we release the plugin, users can be confused about version naming. 
For example, email title could be: ``` [ANN] ICU Analysis plugin for elasticsearch 1.8.0 released ``` We prefer to have that form: ``` [ANN] Elasticsearch ICU Analysis plugin 1.8.0 released ``` Thanks to @spinscale to reporting this. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a35c7c859bc..9e33dedd050 100644 --- a/pom.xml +++ b/pom.xml @@ -7,7 +7,7 @@ elasticsearch-analysis-icu 2.1.0-SNAPSHOT jar - ICU Analysis plugin for elasticsearch + Elasticsearch ICU Analysis plugin The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. https://github.com/elasticsearch/elasticsearch-analysis-icu/ 2009 From eac5e1d7e74bf037261386c3ae3478800271e480 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Sun, 9 Mar 2014 10:51:26 +0100 Subject: [PATCH 072/131] Add plugin version in es-plugin.properties With https://github.com/elasticsearch/elasticsearch/issues/2784, we can now add plugin version in `es-plugin.properties` file. It will only be used with elasticsearch 1.0.0 and upper. No need to push it in 1.x branch. Closes #22. 
--- pom.xml | 6 ++++++ src/main/resources/es-plugin.properties | 1 + 2 files changed, 7 insertions(+) diff --git a/pom.xml b/pom.xml index 9e33dedd050..8c308aa6b05 100644 --- a/pom.xml +++ b/pom.xml @@ -94,6 +94,12 @@ + + + src/main/resources + true + + org.apache.maven.plugins diff --git a/src/main/resources/es-plugin.properties b/src/main/resources/es-plugin.properties index b694c79bec0..cbb41e8bb08 100644 --- a/src/main/resources/es-plugin.properties +++ b/src/main/resources/es-plugin.properties @@ -1 +1,2 @@ plugin=org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin +version=${project.version} From 02df5206e16807ec59198aab987184f3e0c8d4f9 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 19 Mar 2014 22:33:54 +0100 Subject: [PATCH 073/131] Disable java and maven version checking And fix typo in email html --- dev-tools/build_release.py | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/dev-tools/build_release.py b/dev-tools/build_release.py index 9166b09e7e3..74acd8c5f4e 100755 --- a/dev-tools/build_release.py +++ b/dev-tools/build_release.py @@ -43,7 +43,7 @@ from os.path import dirname, abspath The script takes over almost all steps necessary for a release from a high level point of view it does the following things: - - run prerequisite checks ie. check for Java 1.6 being present or S3 credentials available as env variables + - run prerequisite checks ie. check for S3 credentials available as env variables - detect the version to release from the specified branch (--branch) or the current branch - creates a release branch & updates pom.xml and README.md to point to a release version rather than a snapshot - builds the artifacts @@ -109,18 +109,6 @@ def java_exe(): path = JAVA_HOME return 'export JAVA_HOME="%s" PATH="%s/bin:$PATH" JAVACMD="%s/bin/java"' % (path, path, path) -def verify_java_version(version): - s = os.popen('%s; java -version 2>&1' % java_exe()).read() - if s.find(' version "%s.' 
% version) == -1: - raise RuntimeError('got wrong version for java %s:\n%s' % (version, s)) - -# Verifies the java version. We guarantee that we run with Java 1.6 -# If 1.6 is not available fail the build! -def verify_mvn_java_version(version, mvn): - s = os.popen('%s; %s --version 2>&1' % (java_exe(), mvn)).read() - if s.find('Java version: %s' % version) == -1: - raise RuntimeError('got wrong java version for %s %s:\n%s' % (mvn, version, s)) - # Returns the hash of the current git HEAD revision def get_head_hash(): return os.popen(' git rev-parse --verify HEAD 2>&1').read().strip() @@ -133,9 +121,6 @@ def get_tag_hash(tag): def get_current_branch(): return os.popen('git rev-parse --abbrev-ref HEAD 2>&1').read().strip() -verify_java_version('1.6') # we require to build with 1.6 -verify_mvn_java_version('1.6', MVN) - # Utility that returns the name of the release branch for a given version def release_branch(version): return 'release_branch_%s' % version @@ -218,7 +203,7 @@ def add_version_snapshot(readme_file, release, snapshot): # If we find pattern, we copy the line and replace its content if line.find(pattern) >= 0: return line.replace(pattern, replacement).replace('%s' % (datetime.datetime.now().strftime("%Y-%m-%d")), - 'XXXX-XX-XX')+line + 'XXXX-XX-XX')+line else: return line process_file(readme_file, callback) @@ -356,15 +341,15 @@ def format_issues_html(issues, title='Fix'): if len(issues) > 0: response += '

%s

\n
    \n' % title for issue in issues: - response += '[%s] - %s\n' % (issue.html_url, issue.number, issue.title) + response += '
  • [%s] - %s\n' % (issue.html_url, issue.number, issue.title) response += '
\n' return response def get_github_repository(reponame, - login=env.get('GITHUB_LOGIN', None), - password=env.get('GITHUB_PASSWORD', None), - key=env.get('GITHUB_KEY', None)): + login=env.get('GITHUB_LOGIN', None), + password=env.get('GITHUB_PASSWORD', None), + key=env.get('GITHUB_KEY', None)): if login: g = github3.login(login, password) elif key: From 315a7190ad8b196e2a9c6a00be3cdf7d6241336d Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 26 Mar 2014 16:42:14 +0100 Subject: [PATCH 074/131] Create branches according to elasticsearch versions We create branches: * es-0.90 for elasticsearch 0.90 * es-1.0 for elasticsearch 1.0 * es-1.1 for elasticsearch 1.1 * master for elasticsearch master We also check that before releasing we don't have a dependency to an elasticsearch SNAPSHOT version. Add links to each version in documentation (cherry picked from commit 35f5901) --- README.md | 13 ++++++++----- dev-tools/build_release.py | 29 +++++++++++++++++++++++++++++ pom.xml | 6 +++--- 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index fa5ec91548f..c8bda6d6bdf 100644 --- a/README.md +++ b/README.md @@ -5,15 +5,18 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.0.0`. -* For 1.0.x elasticsearch versions, look at [master branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/master). -* For 0.90.x elasticsearch versions, look at [1.x branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/1.x). +* For master elasticsearch versions, look at [master branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/master). +* For 1.1.x elasticsearch versions, look at [es-1.1 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.1). 
+* For 1.0.x elasticsearch versions, look at [es-1.0 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.0). +* For 0.90.x elasticsearch versions, look at [es-0.90 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-0.90). | ICU Analysis Plugin | elasticsearch | Release date | |-----------------------------|---------------------|:------------:| -| 2.1.0-SNAPSHOT | 1.0.0.RC1 -> master | XXXX-XX-XX | -| 2.0.0 | 1.0.0.RC1 -> master | 2014-02-28 | -| 2.0.0.RC1 | 1.0.0.RC1 -> master | 2014-01-15 | +| 3.0.0-SNAPSHOT | master | XXXX-XX-XX | +Please read documentation relative to the version you are using: + +* [3.0.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/blob/master/README.md) ICU Normalization ----------------- diff --git a/dev-tools/build_release.py b/dev-tools/build_release.py index 74acd8c5f4e..db8345440c7 100755 --- a/dev-tools/build_release.py +++ b/dev-tools/build_release.py @@ -208,6 +208,29 @@ def add_version_snapshot(readme_file, release, snapshot): return line process_file(readme_file, callback) +# Moves the README.md file from a snapshot to a release (documentation link) +def remove_documentation_snapshot(readme_file, repo_url, release, branch): + pattern = '* [%s-SNAPSHOT](%sblob/%s/README.md)' % (release, repo_url, branch) + replacement = '* [%s](%sblob/v%s/README.md)' % (release, repo_url, release) + def callback(line): + # If we find pattern, we replace its content + if line.find(pattern) >= 0: + return line.replace(pattern, replacement) + else: + return line + process_file(readme_file, callback) + +# Add in README.markdown file the documentation for the next version +def add_documentation_snapshot(readme_file, repo_url, release, snapshot, branch): + pattern = '* [%s](%sblob/v%s/README.md)' % (release, repo_url, release) + replacement = '* [%s-SNAPSHOT](%sblob/%s/README.md)' % (snapshot, repo_url, branch) + def callback(line): + # If we find pattern, we copy the line and 
replace its content + if line.find(pattern) >= 0: + return line.replace(pattern, replacement)+line + else: + return line + process_file(readme_file, callback) # Set release date in README.md file def set_date(readme_file): @@ -603,8 +626,12 @@ if __name__ == '__main__': artifact_name = find_from_pom('name') artifact_description = find_from_pom('description') project_url = find_from_pom('url') + elasticsearch_version = find_from_pom('elasticsearch.version') print(' Artifact Id: [%s]' % artifact_id) print(' Release version: [%s]' % release_version) + print(' Elasticsearch: [%s]' % elasticsearch_version) + if elasticsearch_version.find('-SNAPSHOT') != -1: + raise RuntimeError('Can not release with a SNAPSHOT elasticsearch dependency: %s' % elasticsearch_version) # extract snapshot default_snapshot_version = guess_snapshot(release_version) @@ -626,6 +653,7 @@ if __name__ == '__main__': try: pending_files = [POM_FILE, README_FILE] remove_maven_snapshot(POM_FILE, release_version) + remove_documentation_snapshot(README_FILE, project_url, release_version, src_branch) remove_version_snapshot(README_FILE, release_version) set_date(README_FILE) set_install_instructions(README_FILE, artifact_id, release_version) @@ -657,6 +685,7 @@ if __name__ == '__main__': add_maven_snapshot(POM_FILE, release_version, snapshot_version) add_version_snapshot(README_FILE, release_version, snapshot_version) + add_documentation_snapshot(README_FILE, project_url, release_version, snapshot_version, src_branch) add_pending_files(*pending_files) commit_snapshot() diff --git a/pom.xml b/pom.xml index 8c308aa6b05..67151c3ba6e 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-icu - 2.1.0-SNAPSHOT + 3.0.0-SNAPSHOT jar Elasticsearch ICU Analysis plugin The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. 
@@ -32,8 +32,8 @@ - 1.0.0 - 4.6.1 + 2.0.0-SNAPSHOT + 4.7.0 1 true onerror From 84adacef4af25afa624097f7cfe8b18e13881023 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 29 Apr 2014 09:40:29 +0200 Subject: [PATCH 075/131] Upgrade to Lucene 4.8. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 67151c3ba6e..23aeda9e444 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ 2.0.0-SNAPSHOT - 4.7.0 + 4.8.0 1 true onerror From 55bd212e61352d553d0e1972c54c74232fa6af04 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Tue, 29 Apr 2014 10:34:19 +0200 Subject: [PATCH 076/131] Create branch es-1.2 --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c8bda6d6bdf..b2da5be9308 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.0.0`. * For master elasticsearch versions, look at [master branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/master). +* For 1.2.x elasticsearch versions, look at [es-1.1 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.2). * For 1.1.x elasticsearch versions, look at [es-1.1 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.1). * For 1.0.x elasticsearch versions, look at [es-1.0 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.0). * For 0.90.x elasticsearch versions, look at [es-0.90 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-0.90). From dafa7e764def1b1650a5a602fa5efb45a29ca34d Mon Sep 17 00:00:00 2001 From: David Pilato Date: Tue, 29 Apr 2014 10:51:25 +0200 Subject: [PATCH 077/131] Add lucene version in es-plugin.properties Closes #25. 
(cherry picked from commit 8836775) --- src/main/resources/es-plugin.properties | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/resources/es-plugin.properties b/src/main/resources/es-plugin.properties index cbb41e8bb08..66dd160c014 100644 --- a/src/main/resources/es-plugin.properties +++ b/src/main/resources/es-plugin.properties @@ -1,2 +1,3 @@ plugin=org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin version=${project.version} +lucene=${lucene.version} From f068ef88a4367757781d4c6b7b06f2e5000fe090 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 28 May 2014 15:24:39 +0200 Subject: [PATCH 078/131] Use JS markdown formatter (cherry picked from commit 3941016) --- README.md | 128 +++++++++++++++++++++++++++++------------------------- 1 file changed, 70 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index b2da5be9308..cc528c013cb 100644 --- a/README.md +++ b/README.md @@ -24,36 +24,40 @@ ICU Normalization Normalizes characters as explained [here](http://userguide.icu-project.org/transforms/normalization). It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings. Allows for the name parameter to be provided which can include the following values: `nfc`, `nfkc`, and `nfkc_cf`. Here is a sample settings: - { - "index" : { - "analysis" : { - "analyzer" : { - "collation" : { - "tokenizer" : "keyword", - "filter" : ["icu_normalizer"] - } +```js +{ + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "keyword", + "filter" : ["icu_normalizer"] } } } } +} +``` ICU Folding ----------- Folding of unicode characters based on `UTR#30`. It registers itself under `icu_folding` and `icuFolding` names. 
Sample setting: - { - "index" : { - "analysis" : { - "analyzer" : { - "collation" : { - "tokenizer" : "keyword", - "filter" : ["icu_folding"] - } +```js +{ + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "keyword", + "filter" : ["icu_folding"] } } } } +} +``` ICU Filtering ------------- @@ -64,24 +68,26 @@ language is wanted. See syntax for the UnicodeSet [here](http://icu-project.org/ The Following example exempts Swedish characters from the folding. Note that the filtered characters are NOT lowercased which is why we add that filter below. - { - "index" : { - "analysis" : { - "analyzer" : { - "folding" : { - "tokenizer" : "standard", - "filter" : ["my_icu_folding", "lowercase"] - } +```js +{ + "index" : { + "analysis" : { + "analyzer" : { + "folding" : { + "tokenizer" : "standard", + "filter" : ["my_icu_folding", "lowercase"] } - "filter" : { - "my_icu_folding" : { - "type" : "icu_folding" - "unicodeSetFilter" : "[^åäöÅÄÖ]" - } + } + "filter" : { + "my_icu_folding" : { + "type" : "icu_folding" + "unicodeSetFilter" : "[^åäöÅÄÖ]" } } } } +} +``` ICU Collation ------------- @@ -94,39 +100,43 @@ Uses collation token filter. 
Allows to either specify the rules for collation Here is a sample settings: - { - "index" : { - "analysis" : { - "analyzer" : { - "collation" : { - "tokenizer" : "keyword", - "filter" : ["icu_collation"] - } +```js +{ + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "keyword", + "filter" : ["icu_collation"] } } } } +} +``` And here is a sample of custom collation: - { - "index" : { - "analysis" : { - "analyzer" : { - "collation" : { - "tokenizer" : "keyword", - "filter" : ["myCollator"] - } - }, - "filter" : { - "myCollator" : { - "type" : "icu_collation", - "language" : "en" - } +```js +{ + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "keyword", + "filter" : ["myCollator"] + } + }, + "filter" : { + "myCollator" : { + "type" : "icu_collation", + "language" : "en" } } } } +} +``` Optional options: * `strength` - The strength property determines the minimum level of difference considered significant during comparison. @@ -159,17 +169,19 @@ ICU Tokenizer Breaks text into words according to [UAX #29: Unicode Text Segmentation](http://www.unicode.org/reports/tr29/). - { - "index" : { - "analysis" : { - "analyzer" : { - "collation" : { - "tokenizer" : "icu_tokenizer", - } +```js +{ + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "icu_tokenizer", } } } } +} +``` License From f1eae455fb8bc12f007245e8f70ece7c95f823a0 Mon Sep 17 00:00:00 2001 From: Jun Ohtani Date: Wed, 28 May 2014 15:46:59 +0200 Subject: [PATCH 079/131] add ICUNormalizer2CharFilter Included ICUNormalizer2Charfilter in Lucene 4.8.0. Add CharFilterFactory. Now, char_filter name is "icu_normalizer", however token_filter name is same name. Closes #27. 
(cherry picked from commit 0cbf1b3) --- README.md | 25 ++++ .../analysis/IcuAnalysisBinderProcessor.java | 5 + .../IcuNormalizerCharFilterFactory.java | 63 ++++++++++ .../analysis/SimpleIcuAnalysisTests.java | 3 + .../SimpleIcuNormalizerCharFilterTests.java | 109 ++++++++++++++++++ 5 files changed, 205 insertions(+) create mode 100644 src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java create mode 100644 src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java diff --git a/README.md b/README.md index cc528c013cb..0cc22a6b7f8 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,31 @@ Breaks text into words according to [UAX #29: Unicode Text Segmentation](http:// ``` +ICU Normalization CharFilter +----------------- + +Normalizes characters as explained [here](http://userguide.icu-project.org/transforms/normalization). +It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings. +Allows for the name parameter to be provided which can include the following values: `nfc`, `nfkc`, and `nfkc_cf`. +Allows for the mode parameter to be provided which can include the following values: `compose` and `decompose`. +Use `decompose` with `nfc` or `nfkc`, to get `nfd` or `nfkd`, respectively. 
+Here is a sample settings: + +```js +{ + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "keyword", + "char_filter" : ["icu_normalizer"] + } + } + } + } +} +``` + License ------- diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java index f23d32b5c74..8db169b9318 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java @@ -23,6 +23,11 @@ package org.elasticsearch.index.analysis; */ public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { + @Override + public void processCharFilters(CharFiltersBindings charFiltersBindings) { + charFiltersBindings.processCharFilter("icu_normalizer", IcuNormalizerCharFilterFactory.class); + } + @Override public void processTokenizers(TokenizersBindings tokenizersBindings) { tokenizersBindings.processTokenizer("icu_tokenizer", IcuTokenizerFactory.class); diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java new file mode 100644 index 00000000000..337461c5095 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java @@ -0,0 +1,63 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + + +import com.ibm.icu.text.Normalizer2; +import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +import java.io.Reader; + + +/** + * Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter} to normalize character. + *

+ *

The name can be used to provide the type of normalization to perform.

+ *

The mode can be used to provide 'compose' or 'decompose'. Default is compose.

+ */ +public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory { + + private final String name; + + private final Normalizer2 normalizer; + + + @Inject + public IcuNormalizerCharFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name); + this.name = settings.get("name", "nfkc_cf"); + String mode = settings.get("mode"); + if (!"compose".equals(mode) && !"decompose".equals(mode)) { + mode = "compose"; + } + this.normalizer = Normalizer2.getInstance( + null, this.name, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE); + } + + @Override + public Reader create(Reader reader) { + return new ICUNormalizer2CharFilter(reader, normalizer); + } +} diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java index 99f46ebffbe..e12db59c6a8 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java @@ -65,5 +65,8 @@ public class SimpleIcuAnalysisTests extends ElasticsearchTestCase { filterFactory = analysisService.tokenFilter("icu_transform"); assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class)); + + CharFilterFactory charFilterFactory = analysisService.charFilter("icu_normalizer"); + assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class)); } } diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java new file mode 100644 index 00000000000..c4cbb945e4c --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java @@ -0,0 +1,109 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import com.ibm.icu.text.Normalizer2; +import org.apache.lucene.analysis.CharFilter; +import org.elasticsearch.common.inject.Injector; +import org.elasticsearch.common.inject.ModulesBuilder; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.settings.SettingsModule; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.EnvironmentModule; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexNameModule; +import org.elasticsearch.index.settings.IndexSettingsModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisService; +import org.elasticsearch.test.ElasticsearchTestCase; +import org.junit.Test; + +import java.io.StringReader; + +/** + * Test + */ +public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase { + + @Test + public void testDefaultSetting() throws Exception { + + Index index = new Index("test"); + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer") + .build(); + AnalysisService 
analysisService = createAnalysisService(index, settings); + CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar"); + + String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि"; + Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE); + String expectedOutput = normalizer.normalize(input); + CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input)); + char[] tempBuff = new char[10]; + StringBuilder output = new StringBuilder(); + while (true) { + int length = inputReader.read(tempBuff); + if (length == -1) break; + output.append(tempBuff, 0, length); + assertEquals(output.toString(), normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length())))); + } + assertEquals(expectedOutput, output.toString()); + } + + + @Test + public void testNameAndModeSetting() throws Exception { + + Index index = new Index("test"); + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer") + .put("index.analysis.char_filter.myNormalizerChar.name", "nfkc") + .put("index.analysis.char_filter.myNormalizerChar.mode", "decompose") + .build(); + AnalysisService analysisService = createAnalysisService(index, settings); + CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar"); + + String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि"; + Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE); + String expectedOutput = normalizer.normalize(input); + CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input)); + char[] tempBuff = new char[10]; + StringBuilder output = new StringBuilder(); + while (true) { + int length = inputReader.read(tempBuff); + if (length == -1) break; + output.append(tempBuff, 0, length); + assertEquals(output.toString(), 
normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length())))); + } + assertEquals(expectedOutput, output.toString()); + } + + private AnalysisService createAnalysisService(Index index, Settings settings) { + Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector(); + Injector injector = new ModulesBuilder().add( + new IndexSettingsModule(index, settings), + new IndexNameModule(index), + new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor())) + .createChildInjector(parentInjector); + + return injector.getInstance(AnalysisService.class); + } +} From 5523739b20f34ea285f424b7f1c04dc74b96bf9f Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 28 May 2014 16:50:27 +0200 Subject: [PATCH 080/131] Add integration tests Closes #31. (cherry picked from commit 39b83f0) --- .../index/analysis/ICUIntegrationTests.java | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java diff --git a/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java b/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java new file mode 100644 index 00000000000..23c12d137de --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java @@ -0,0 +1,111 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + +import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse; +import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.test.ElasticsearchIntegrationTest; +import org.junit.Test; + +import java.io.IOException; +import java.util.concurrent.ExecutionException; + +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.CoreMatchers.notNullValue; + +@ElasticsearchIntegrationTest.ClusterScope(scope = ElasticsearchIntegrationTest.Scope.SUITE) +public class ICUIntegrationTests extends ElasticsearchIntegrationTest { + + @Override + public Settings indexSettings() { + Settings settings = ImmutableSettings.builder() + .put(super.indexSettings()) + .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard") + .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_collator") + .put("index.analysis.filter.my_collator.type", "icu_collation") + .put("index.analysis.filter.my_collator.language", "en") + .put("index.analysis.filter.my_collator.strength", "primary") + .build(); + + return settings; + } + + @Test + public void testICUAnalyzer() throws 
ExecutionException, InterruptedException { + createIndex("test"); + ensureGreen("test"); + AnalyzeResponse response1 = client().admin().indices() + .prepareAnalyze("Bâton enflammé") + .setIndex("test") + .setAnalyzer("my_analyzer") + .execute().get(); + AnalyzeResponse response2 = client().admin().indices() + .prepareAnalyze("baton enflamme") + .setIndex("test") + .setAnalyzer("my_analyzer") + .execute().get(); + + assertThat(response1, notNullValue()); + assertThat(response2, notNullValue()); + assertThat(response1.getTokens().size(), is(response2.getTokens().size())); + + for (int i = 0; i < response2.getTokens().size(); i++) { + assertThat(response1.getTokens().get(i).getTerm(), is(response2.getTokens().get(i).getTerm())); + } + } + + @Test + public void testICUAnalyzerInMapping() throws ExecutionException, InterruptedException, IOException { + createIndex("test"); + ensureGreen("test"); + final XContentBuilder mapping = jsonBuilder().startObject() + .startObject("type") + .startObject("properties") + .startObject("foo") + .field("type", "string") + .field("analyzer", "my_analyzer") + .endObject() + .endObject() + .endObject() + .endObject(); + + client().admin().indices().preparePutMapping("test").setType("type").setSource(mapping).get(); + + index("test", "type", "1", "foo", "Bâton enflammé"); + refresh(); + + SearchResponse response = client().prepareSearch("test").setQuery( + QueryBuilders.matchQuery("foo", "baton enflamme") + ).execute().actionGet(); + + assertThat(response.getHits().getTotalHits(), is(1L)); + } + + @Test + public void testPluginIsLoaded() { + NodesInfoResponse infos = client().admin().cluster().prepareNodesInfo().setPlugins(true).execute().actionGet(); + assertThat(infos.getNodes()[0].getPlugins().getInfos().get(0).getName(), is("analysis-icu")); + } +} From 8300ea17ad97fc54f2bdaea53f5b1166b8eb79eb Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 28 May 2014 16:51:47 +0200 Subject: [PATCH 081/131] Update to elasticsearch 1.3.0 Closes 
#32. (cherry picked from commit 4d17e47) --- README.md | 5 +++-- pom.xml | 2 +- .../index/analysis/ICUIntegrationTests.java | 9 +++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0cc22a6b7f8..ae4378cd94a 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,11 @@ ICU Analysis for Elasticsearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.0.0`. +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.1.0`. * For master elasticsearch versions, look at [master branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/master). -* For 1.2.x elasticsearch versions, look at [es-1.1 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.2). +* For 1.3.x elasticsearch versions, look at [es-1.3 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.3). +* For 1.2.x elasticsearch versions, look at [es-1.2 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.2). * For 1.1.x elasticsearch versions, look at [es-1.1 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.1). * For 1.0.x elasticsearch versions, look at [es-1.0 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.0). * For 0.90.x elasticsearch versions, look at [es-0.90 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-0.90). 
diff --git a/pom.xml b/pom.xml index 23aeda9e444..fcd6493ab31 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ 2.0.0-SNAPSHOT - 4.8.0 + 4.8.1 1 true onerror diff --git a/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java b/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java index 23c12d137de..95874c98b07 100644 --- a/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java @@ -25,6 +25,7 @@ import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.plugins.PluginsService; import org.elasticsearch.test.ElasticsearchIntegrationTest; import org.junit.Test; @@ -38,6 +39,14 @@ import static org.hamcrest.CoreMatchers.notNullValue; @ElasticsearchIntegrationTest.ClusterScope(scope = ElasticsearchIntegrationTest.Scope.SUITE) public class ICUIntegrationTests extends ElasticsearchIntegrationTest { + @Override + protected Settings nodeSettings(int nodeOrdinal) { + return ImmutableSettings.builder() + .put(super.nodeSettings(nodeOrdinal)) + .put("plugins." + PluginsService.LOAD_PLUGIN_FROM_CLASSPATH, true) + .build(); + } + @Override public Settings indexSettings() { Settings settings = ImmutableSettings.builder() From 35bf4adadaf09d6c2309070eaa9b5db28a0ee8e8 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 27 Jun 2014 11:50:26 +0200 Subject: [PATCH 082/131] Update to Lucene 4.9.0 Closes #33. 
(cherry picked from commit bc45a89) --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index fcd6493ab31..6ddf5431919 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ 2.0.0-SNAPSHOT - 4.8.1 + 4.9.0 1 true onerror From fae3094764719cdfca208ea8e258f5a8e40789de Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 16 Jul 2014 09:24:37 +0200 Subject: [PATCH 083/131] Update to elasticsearch 1.4.0 Closes #34. (cherry picked from commit ddd941b) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ae4378cd94a..5db24454d9a 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.1.0`. * For master elasticsearch versions, look at [master branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/master). +* For 1.4.x elasticsearch versions, look at [es-1.4 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.4). * For 1.3.x elasticsearch versions, look at [es-1.3 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.3). * For 1.2.x elasticsearch versions, look at [es-1.2 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.2). * For 1.1.x elasticsearch versions, look at [es-1.1 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.1). 
From cbb00e997805ba81418afecc5155153dafd23e44 Mon Sep 17 00:00:00 2001 From: Jun Ohtani Date: Fri, 8 Aug 2014 12:03:18 +0900 Subject: [PATCH 084/131] change version number for 2.3.0 release --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5db24454d9a..dd899b256e6 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ICU Analysis for Elasticsearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.1.0`. +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.3.0`. * For master elasticsearch versions, look at [master branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/master). * For 1.4.x elasticsearch versions, look at [es-1.4 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.4). From e8d23c912984c454d136acb0a566cc8ccd69a39a Mon Sep 17 00:00:00 2001 From: Jun Ohtani Date: Tue, 26 Aug 2014 14:04:37 +0900 Subject: [PATCH 085/131] Docs: make the welcome page more obvious Closes #36. 
--- .gitignore | 1 + README.md | 34 +- dev-tools/build_release.py | 722 ------------------------------------- dev-tools/release.py | 134 +++++++ dev-tools/upload-s3.py | 67 ---- 5 files changed, 156 insertions(+), 802 deletions(-) delete mode 100755 dev-tools/build_release.py create mode 100644 dev-tools/release.py delete mode 100644 dev-tools/upload-s3.py diff --git a/.gitignore b/.gitignore index de7d76e25b7..a99aad1be2e 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ /.classpath /.project /.local-execution-hints.log +/plugin_tools diff --git a/README.md b/README.md index dd899b256e6..a5060ba7545 100644 --- a/README.md +++ b/README.md @@ -3,23 +3,31 @@ ICU Analysis for Elasticsearch The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. -In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.3.0`. +In order to install the plugin, simply run: -* For master elasticsearch versions, look at [master branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/master). -* For 1.4.x elasticsearch versions, look at [es-1.4 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.4). -* For 1.3.x elasticsearch versions, look at [es-1.3 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.3). -* For 1.2.x elasticsearch versions, look at [es-1.2 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.2). -* For 1.1.x elasticsearch versions, look at [es-1.1 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.1). -* For 1.0.x elasticsearch versions, look at [es-1.0 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.0). -* For 0.90.x elasticsearch versions, look at [es-0.90 branch](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-0.90). 
+```sh +bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.3.0 +``` -| ICU Analysis Plugin | elasticsearch | Release date | -|-----------------------------|---------------------|:------------:| -| 3.0.0-SNAPSHOT | master | XXXX-XX-XX | -Please read documentation relative to the version you are using: +| elasticsearch | ICU Analysis Plugin | Docs | +|---------------|-----------------------|------------| +| master | Build from source | See below | +| es-1.x | Build from source | [2.4.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.x/#version-240-snapshot-for-elasticsearch-1x) | +| es-1.3 | 2.3.0 | [2.3.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.3.0/#icu-analysis-for-elasticsearch) | +| es-1.2 | 2.2.0 | [2.2.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.2.0/#icu-analysis-for-elasticsearch) | +| es-1.1 | 2.1.0 | [2.1.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.1.0/#icu-analysis-for-elasticsearch) | +| es-1.0 | 2.0.0 | [2.0.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.0.0/#icu-analysis-for-elasticsearch) | +| es-0.90 | 1.13.0 | [1.13.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v1.13.0/#icu-analysis-for-elasticsearch) | + +To build a `SNAPSHOT` version, you need to build it with Maven: + +```bash +mvn clean install +plugin --install analysis-icu \ + --url file:target/releases/elasticsearch-analysis-icu-X.X.X-SNAPSHOT.zip +``` -* [3.0.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/blob/master/README.md) ICU Normalization ----------------- diff --git a/dev-tools/build_release.py b/dev-tools/build_release.py deleted file mode 100755 index db8345440c7..00000000000 --- a/dev-tools/build_release.py +++ /dev/null @@ -1,722 +0,0 @@ -# Licensed to Elasticsearch under one or more contributor -# license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright -# ownership. Elasticsearch licenses this file to you under -# the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on -# an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, -# either express or implied. See the License for the specific -# language governing permissions and limitations under the License. - -import re -import tempfile -import shutil -import os -import datetime -import argparse -import github3 -import smtplib - -from email.mime.multipart import MIMEMultipart -from email.mime.text import MIMEText - -from os.path import dirname, abspath - -""" - This tool builds a release from the a given elasticsearch plugin branch. - In order to execute it go in the top level directory and run: - $ python3 dev_tools/build_release.py --branch master --publish --remote origin - - By default this script runs in 'dry' mode which essentially simulates a release. If the - '--publish' option is set the actual release is done. - If not in 'dry' mode, a mail will be automatically sent to the mailing list. - You can disable it with the option '--disable_mail' - - $ python3 dev_tools/build_release.py --publish --remote origin --disable_mail - - The script takes over almost all - steps necessary for a release from a high level point of view it does the following things: - - - run prerequisite checks ie. 
check for S3 credentials available as env variables - - detect the version to release from the specified branch (--branch) or the current branch - - creates a release branch & updates pom.xml and README.md to point to a release version rather than a snapshot - - builds the artifacts - - commits the new version and merges the release branch into the source branch - - creates a tag and pushes the commit to the specified origin (--remote) - - publishes the releases to sonatype and S3 - - send a mail based on github issues fixed by this version - -Once it's done it will print all the remaining steps. - - Prerequisites: - - Python 3k for script execution - - Boto for S3 Upload ($ apt-get install python-boto or pip-3.3 install boto) - - github3 module (pip-3.3 install github3.py) - - S3 keys exported via ENV Variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) - - GITHUB (login/password) or key exported via ENV Variables (GITHUB_LOGIN, GITHUB_PASSWORD or GITHUB_KEY) - (see https://github.com/settings/applications#personal-access-tokens) - Optional: default to no authentication - - SMTP_HOST - Optional: default to localhost - - MAIL_SENDER - Optional: default to 'david@pilato.fr': must be authorized to send emails to elasticsearch mailing list - - MAIL_TO - Optional: default to 'elasticsearch@googlegroups.com' -""" -env = os.environ - -LOG = env.get('ES_RELEASE_LOG', '/tmp/elasticsearch_release.log') -ROOT_DIR = os.path.join(abspath(dirname(__file__)), '../') -README_FILE = ROOT_DIR + 'README.md' -POM_FILE = ROOT_DIR + 'pom.xml' - -def log(msg): - log_plain('\n%s' % msg) - -def log_plain(msg): - f = open(LOG, mode='ab') - f.write(msg.encode('utf-8')) - f.close() - -def run(command, quiet=False): - log('%s: RUN: %s\n' % (datetime.datetime.now(), command)) - if os.system('%s >> %s 2>&1' % (command, LOG)): - msg = ' FAILED: %s [see log %s]' % (command, LOG) - if not quiet: - print(msg) - raise RuntimeError(msg) - -try: - JAVA_HOME = env['JAVA_HOME'] -except KeyError: - raise 
RuntimeError(""" - Please set JAVA_HOME in the env before running release tool - On OSX use: export JAVA_HOME=`/usr/libexec/java_home -v '1.6*'`""") - -try: - MVN='mvn' - # make sure mvn3 is used if mvn3 is available - # some systems use maven 2 as default - run('mvn3 --version', quiet=True) - MVN='mvn3' -except RuntimeError: - pass - - -def java_exe(): - path = JAVA_HOME - return 'export JAVA_HOME="%s" PATH="%s/bin:$PATH" JAVACMD="%s/bin/java"' % (path, path, path) - -# Returns the hash of the current git HEAD revision -def get_head_hash(): - return os.popen(' git rev-parse --verify HEAD 2>&1').read().strip() - -# Returns the hash of the given tag revision -def get_tag_hash(tag): - return os.popen('git show-ref --tags %s --hash 2>&1' % (tag)).read().strip() - -# Returns the name of the current branch -def get_current_branch(): - return os.popen('git rev-parse --abbrev-ref HEAD 2>&1').read().strip() - -# Utility that returns the name of the release branch for a given version -def release_branch(version): - return 'release_branch_%s' % version - -# runs get fetch on the given remote -def fetch(remote): - run('git fetch %s' % remote) - -# Creates a new release branch from the given source branch -# and rebases the source branch from the remote before creating -# the release branch. Note: This fails if the source branch -# doesn't exist on the provided remote. -def create_release_branch(remote, src_branch, release): - run('git checkout %s' % src_branch) - run('git pull --rebase %s %s' % (remote, src_branch)) - run('git checkout -b %s' % (release_branch(release))) - - -# Reads the given file and applies the -# callback to it. If the callback changed -# a line the given file is replaced with -# the modified input. 
-def process_file(file_path, line_callback): - fh, abs_path = tempfile.mkstemp() - modified = False - with open(abs_path,'w', encoding='utf-8') as new_file: - with open(file_path, encoding='utf-8') as old_file: - for line in old_file: - new_line = line_callback(line) - modified = modified or (new_line != line) - new_file.write(new_line) - os.close(fh) - if modified: - #Remove original file - os.remove(file_path) - #Move new file - shutil.move(abs_path, file_path) - return True - else: - # nothing to do - just remove the tmp file - os.remove(abs_path) - return False - -# Guess the next snapshot version number (increment second digit) -def guess_snapshot(version): - digits=list(map(int, re.findall(r'\d+', version))) - source='%s.%s' % (digits[0], digits[1]) - destination='%s.%s' % (digits[0], digits[1]+1) - return version.replace(source, destination) - -# Moves the pom.xml file from a snapshot to a release -def remove_maven_snapshot(pom, release): - pattern = '%s-SNAPSHOT' % release - replacement = '%s' % release - def callback(line): - return line.replace(pattern, replacement) - process_file(pom, callback) - -# Moves the README.md file from a snapshot to a release -def remove_version_snapshot(readme_file, release): - pattern = '%s-SNAPSHOT' % release - replacement = '%s ' % release - def callback(line): - return line.replace(pattern, replacement) - process_file(readme_file, callback) - -# Moves the pom.xml file to the next snapshot -def add_maven_snapshot(pom, release, snapshot): - pattern = '%s' % release - replacement = '%s-SNAPSHOT' % snapshot - def callback(line): - return line.replace(pattern, replacement) - process_file(pom, callback) - -# Add in README.md file the next snapshot -def add_version_snapshot(readme_file, release, snapshot): - pattern = '| %s ' % release - replacement = '| %s-SNAPSHOT' % snapshot - def callback(line): - # If we find pattern, we copy the line and replace its content - if line.find(pattern) >= 0: - return line.replace(pattern, 
replacement).replace('%s' % (datetime.datetime.now().strftime("%Y-%m-%d")), - 'XXXX-XX-XX')+line - else: - return line - process_file(readme_file, callback) - -# Moves the README.md file from a snapshot to a release (documentation link) -def remove_documentation_snapshot(readme_file, repo_url, release, branch): - pattern = '* [%s-SNAPSHOT](%sblob/%s/README.md)' % (release, repo_url, branch) - replacement = '* [%s](%sblob/v%s/README.md)' % (release, repo_url, release) - def callback(line): - # If we find pattern, we replace its content - if line.find(pattern) >= 0: - return line.replace(pattern, replacement) - else: - return line - process_file(readme_file, callback) - -# Add in README.markdown file the documentation for the next version -def add_documentation_snapshot(readme_file, repo_url, release, snapshot, branch): - pattern = '* [%s](%sblob/v%s/README.md)' % (release, repo_url, release) - replacement = '* [%s-SNAPSHOT](%sblob/%s/README.md)' % (snapshot, repo_url, branch) - def callback(line): - # If we find pattern, we copy the line and replace its content - if line.find(pattern) >= 0: - return line.replace(pattern, replacement)+line - else: - return line - process_file(readme_file, callback) - -# Set release date in README.md file -def set_date(readme_file): - pattern = 'XXXX-XX-XX' - replacement = '%s' % (datetime.datetime.now().strftime("%Y-%m-%d")) - def callback(line): - return line.replace(pattern, replacement) - process_file(readme_file, callback) - -# Update installation instructions in README.md file -def set_install_instructions(readme_file, artifact_name, release): - pattern = '`bin/plugin -install elasticsearch/%s/.+`' % artifact_name - replacement = '`bin/plugin -install elasticsearch/%s/%s`' % (artifact_name, release) - def callback(line): - return re.sub(pattern, replacement, line) - process_file(readme_file, callback) - - -# Stages the given files for the next git commit -def add_pending_files(*files): - for file in files: - run('git add %s' % 
file) - -# Executes a git commit with 'release [version]' as the commit message -def commit_release(artifact_id, release): - run('git commit -m "prepare release %s-%s"' % (artifact_id, release)) - -def commit_snapshot(): - run('git commit -m "prepare for next development iteration"') - -def tag_release(release): - run('git tag -a v%s -m "Tag release version %s"' % (release, release)) - -def run_mvn(*cmd): - for c in cmd: - run('%s; %s -f %s %s' % (java_exe(), MVN, POM_FILE, c)) - -def build_release(run_tests=False, dry_run=True): - target = 'deploy' - if dry_run: - target = 'package' - if run_tests: - run_mvn('clean test') - run_mvn('clean %s -DskipTests' %(target)) - -# Checks the pom.xml for the release version. 2.0.0-SNAPSHOT -# This method fails if the pom file has no SNAPSHOT version set ie. -# if the version is already on a release version we fail. -# Returns the next version string ie. 0.90.7 -def find_release_version(src_branch): - run('git checkout %s' % src_branch) - with open(POM_FILE, encoding='utf-8') as file: - for line in file: - match = re.search(r'(.+)-SNAPSHOT', line) - if match: - return match.group(1) - raise RuntimeError('Could not find release version in branch %s' % src_branch) - -# extract a value from pom.xml -def find_from_pom(tag): - with open(POM_FILE, encoding='utf-8') as file: - for line in file: - match = re.search(r'<%s>(.+)' % (tag, tag), line) - if match: - return match.group(1) - raise RuntimeError('Could not find <%s> in pom.xml file' % (tag)) - -def get_artifacts(artifact_id, release): - artifact_path = ROOT_DIR + 'target/releases/%s-%s.zip' % (artifact_id, release) - print(' Path %s' % (artifact_path)) - if not os.path.isfile(artifact_path): - raise RuntimeError('Could not find required artifact at %s' % (artifact_path)) - return artifact_path - -# Generates sha1 for a file -# and returns the checksum files as well -# as the given files in a list -def generate_checksums(release_file): - res = [] - directory = 
os.path.dirname(release_file) - file = os.path.basename(release_file) - checksum_file = '%s.sha1.txt' % file - - if os.system('cd %s; shasum %s > %s' % (directory, file, checksum_file)): - raise RuntimeError('Failed to generate checksum for file %s' % release_file) - res = res + [os.path.join(directory, checksum_file), release_file] - return res - -def git_merge(src_branch, release_version): - run('git checkout %s' % src_branch) - run('git merge %s' % release_branch(release_version)) - -def git_push(remote, src_branch, release_version, dry_run): - if not dry_run: - run('git push %s %s' % (remote, src_branch)) # push the commit - run('git push %s v%s' % (remote, release_version)) # push the tag - else: - print(' dryrun [True] -- skipping push to remote %s' % remote) - -def publish_artifacts(artifacts, base='elasticsearch/elasticsearch', dry_run=True): - location = os.path.dirname(os.path.realpath(__file__)) - for artifact in artifacts: - if dry_run: - print('Skip Uploading %s to Amazon S3 in %s' % (artifact, base)) - else: - print('Uploading %s to Amazon S3' % artifact) - # requires boto to be installed but it is not available on python3k yet so we use a dedicated tool - run('python %s/upload-s3.py --file %s --path %s' % (location, os.path.abspath(artifact), base)) - - -################# -## -## -## Email and Github Management -## -## -################# -def format_issues_plain(issues, title='Fix'): - response = "" - - if len(issues) > 0: - response += '%s:\n' % title - for issue in issues: - response += ' * [%s] - %s (%s)\n' % (issue.number, issue.title, issue.html_url) - - return response - -def format_issues_html(issues, title='Fix'): - response = "" - - if len(issues) > 0: - response += '

%s

\n
    \n' % title - for issue in issues: - response += '
  • [%s] - %s\n' % (issue.html_url, issue.number, issue.title) - response += '
\n' - - return response - -def get_github_repository(reponame, - login=env.get('GITHUB_LOGIN', None), - password=env.get('GITHUB_PASSWORD', None), - key=env.get('GITHUB_KEY', None)): - if login: - g = github3.login(login, password) - elif key: - g = github3.login(token=key) - else: - g = github3.GitHub() - - return g.repository("elasticsearch", reponame) - -# Check if there are some remaining open issues and fails -def check_opened_issues(version, repository, reponame): - opened_issues = [i for i in repository.iter_issues(state='open', labels='%s' % version)] - if len(opened_issues)>0: - raise NameError('Some issues [%s] are still opened. Check https://github.com/elasticsearch/%s/issues?labels=%s&state=open' - % (len(opened_issues), reponame, version)) - -# List issues from github: can be done anonymously if you don't -# exceed a given number of github API calls per day -# Check if there are some remaining open issues and fails -def list_issues(version, - repository, - severity='bug'): - issues = [i for i in repository.iter_issues(state='closed', labels='%s,%s' % (severity, version))] - return issues - -# Get issues from github and generates a Plain/HTML Multipart email -# And send it if dry_run=False -def prepare_email(artifact_id, release_version, repository, - artifact_name, artifact_description, project_url, - severity_labels_bug='bug', - severity_labels_update='update', - severity_labels_new='new', - severity_labels_doc='doc'): - - ## Get bugs from github - issues_bug = list_issues(release_version, repository, severity=severity_labels_bug) - issues_update = list_issues(release_version, repository, severity=severity_labels_update) - issues_new = list_issues(release_version, repository, severity=severity_labels_new) - issues_doc = list_issues(release_version, repository, severity=severity_labels_doc) - - ## Format content to plain text - plain_issues_bug = format_issues_plain(issues_bug, 'Fix') - plain_issues_update = format_issues_plain(issues_update, 'Update') 
- plain_issues_new = format_issues_plain(issues_new, 'New') - plain_issues_doc = format_issues_plain(issues_doc, 'Doc') - - ## Format content to html - html_issues_bug = format_issues_html(issues_bug, 'Fix') - html_issues_update = format_issues_html(issues_update, 'Update') - html_issues_new = format_issues_html(issues_new, 'New') - html_issues_doc = format_issues_html(issues_doc, 'Doc') - - if len(issues_bug)+len(issues_update)+len(issues_new)+len(issues_doc) > 0: - plain_empty_message = "" - html_empty_message = "" - - else: - plain_empty_message = "No issue listed for this release" - html_empty_message = "

No issue listed for this release

" - - msg = MIMEMultipart('alternative') - msg['Subject'] = '[ANN] %s %s released' % (artifact_name, release_version) - text = """ -Heya, - - -We are pleased to announce the release of the %(artifact_name)s, version %(release_version)s. - -%(artifact_description)s. - -%(project_url)s - -Release Notes - %(artifact_id)s - Version %(release_version)s - -%(empty_message)s -%(issues_bug)s -%(issues_update)s -%(issues_new)s -%(issues_doc)s - -Issues, Pull requests, Feature requests are warmly welcome on %(artifact_id)s project repository: %(project_url)s -For questions or comments around this plugin, feel free to use elasticsearch mailing list: https://groups.google.com/forum/#!forum/elasticsearch - -Enjoy, - --The Elasticsearch team -""" % {'release_version': release_version, - 'artifact_id': artifact_id, - 'artifact_name': artifact_name, - 'artifact_description': artifact_description, - 'project_url': project_url, - 'empty_message': plain_empty_message, - 'issues_bug': plain_issues_bug, - 'issues_update': plain_issues_update, - 'issues_new': plain_issues_new, - 'issues_doc': plain_issues_doc} - - html = """ - - -

Heya,

- -

We are pleased to announce the release of the %(artifact_name)s, version %(release_version)s

- -
%(artifact_description)s.
- -

Release Notes - Version %(release_version)s

-%(empty_message)s -%(issues_bug)s -%(issues_update)s -%(issues_new)s -%(issues_doc)s - -

Issues, Pull requests, Feature requests are warmly welcome on -%(artifact_id)s project repository!

-

For questions or comments around this plugin, feel free to use elasticsearch -mailing list!

- -

Enjoy,

- -

- The Elasticsearch team

- -""" % {'release_version': release_version, - 'artifact_id': artifact_id, - 'artifact_name': artifact_name, - 'artifact_description': artifact_description, - 'project_url': project_url, - 'empty_message': html_empty_message, - 'issues_bug': html_issues_bug, - 'issues_update': html_issues_update, - 'issues_new': html_issues_new, - 'issues_doc': html_issues_doc} - - # Record the MIME types of both parts - text/plain and text/html. - part1 = MIMEText(text, 'plain') - part2 = MIMEText(html, 'html') - - # Attach parts into message container. - # According to RFC 2046, the last part of a multipart message, in this case - # the HTML message, is best and preferred. - msg.attach(part1) - msg.attach(part2) - - return msg - -def send_email(msg, - dry_run=True, - mail=True, - sender=env.get('MAIL_SENDER'), - to=env.get('MAIL_TO', 'elasticsearch@googlegroups.com'), - smtp_server=env.get('SMTP_SERVER', 'localhost')): - msg['From'] = 'Elasticsearch Team <%s>' % sender - msg['To'] = 'Elasticsearch Mailing List <%s>' % to - # save mail on disk - with open(ROOT_DIR+'target/email.txt', 'w') as email_file: - email_file.write(msg.as_string()) - if mail and not dry_run: - s = smtplib.SMTP(smtp_server, 25) - s.sendmail(sender, to, msg.as_string()) - s.quit() - else: - print('generated email: open %starget/email.txt' % ROOT_DIR) - -def print_sonatype_notice(): - settings = os.path.join(os.path.expanduser('~'), '.m2/settings.xml') - if os.path.isfile(settings): - with open(settings, encoding='utf-8') as settings_file: - for line in settings_file: - if line.strip() == 'sonatype-nexus-snapshots': - # moving out - we found the indicator no need to print the warning - return - print(""" - NOTE: No sonatype settings detected, make sure you have configured - your sonatype credentials in '~/.m2/settings.xml': - - - ... - - - sonatype-nexus-snapshots - your-jira-id - your-jira-pwd - - - sonatype-nexus-staging - your-jira-id - your-jira-pwd - - - ... 
- - """) - -def check_s3_credentials(): - if not env.get('AWS_ACCESS_KEY_ID', None) or not env.get('AWS_SECRET_ACCESS_KEY', None): - raise RuntimeError('Could not find "AWS_ACCESS_KEY_ID" / "AWS_SECRET_ACCESS_KEY" in the env variables please export in order to upload to S3') - -def check_github_credentials(): - if not env.get('GITHUB_KEY', None) and not env.get('GITHUB_LOGIN', None): - log('WARN: Could not find "GITHUB_LOGIN" / "GITHUB_PASSWORD" or "GITHUB_KEY" in the env variables. You could need it.') - -def check_email_settings(): - if not env.get('MAIL_SENDER', None): - raise RuntimeError('Could not find "MAIL_SENDER"') - -# we print a notice if we can not find the relevant infos in the ~/.m2/settings.xml -print_sonatype_notice() - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Builds and publishes a Elasticsearch Plugin Release') - parser.add_argument('--branch', '-b', metavar='master', default=get_current_branch(), - help='The branch to release from. Defaults to the current branch.') - parser.add_argument('--skiptests', '-t', dest='tests', action='store_false', - help='Skips tests before release. Tests are run by default.') - parser.set_defaults(tests=True) - parser.add_argument('--remote', '-r', metavar='origin', default='origin', - help='The remote to push the release commit and tag to. Default is [origin]') - parser.add_argument('--publish', '-p', dest='dryrun', action='store_false', - help='Publishes the release. Disable by default.') - parser.add_argument('--disable_mail', '-dm', dest='mail', action='store_false', - help='Do not send a release email. 
Email is sent by default.') - - parser.set_defaults(dryrun=True) - parser.set_defaults(mail=True) - args = parser.parse_args() - - src_branch = args.branch - remote = args.remote - run_tests = args.tests - dry_run = args.dryrun - mail = args.mail - - if not dry_run: - check_s3_credentials() - print('WARNING: dryrun is set to "false" - this will push and publish the release') - if mail: - check_email_settings() - print('An email to %s will be sent after the release' - % env.get('MAIL_TO', 'elasticsearch@googlegroups.com')) - input('Press Enter to continue...') - - check_github_credentials() - - print(''.join(['-' for _ in range(80)])) - print('Preparing Release from branch [%s] running tests: [%s] dryrun: [%s]' % (src_branch, run_tests, dry_run)) - print(' JAVA_HOME is [%s]' % JAVA_HOME) - print(' Running with maven command: [%s] ' % (MVN)) - - release_version = find_release_version(src_branch) - artifact_id = find_from_pom('artifactId') - artifact_name = find_from_pom('name') - artifact_description = find_from_pom('description') - project_url = find_from_pom('url') - elasticsearch_version = find_from_pom('elasticsearch.version') - print(' Artifact Id: [%s]' % artifact_id) - print(' Release version: [%s]' % release_version) - print(' Elasticsearch: [%s]' % elasticsearch_version) - if elasticsearch_version.find('-SNAPSHOT') != -1: - raise RuntimeError('Can not release with a SNAPSHOT elasticsearch dependency: %s' % elasticsearch_version) - - # extract snapshot - default_snapshot_version = guess_snapshot(release_version) - snapshot_version = input('Enter next snapshot version [%s]:' % default_snapshot_version) - snapshot_version = snapshot_version or default_snapshot_version - - print(' Next version: [%s-SNAPSHOT]' % snapshot_version) - print(' Artifact Name: [%s]' % artifact_name) - print(' Artifact Description: [%s]' % artifact_description) - print(' Project URL: [%s]' % project_url) - - if not dry_run: - smoke_test_version = release_version - head_hash = 
get_head_hash() - run_mvn('clean') # clean the env! - create_release_branch(remote, src_branch, release_version) - print(' Created release branch [%s]' % (release_branch(release_version))) - success = False - try: - pending_files = [POM_FILE, README_FILE] - remove_maven_snapshot(POM_FILE, release_version) - remove_documentation_snapshot(README_FILE, project_url, release_version, src_branch) - remove_version_snapshot(README_FILE, release_version) - set_date(README_FILE) - set_install_instructions(README_FILE, artifact_id, release_version) - print(' Done removing snapshot version') - add_pending_files(*pending_files) # expects var args use * to expand - commit_release(artifact_id, release_version) - print(' Committed release version [%s]' % release_version) - print(''.join(['-' for _ in range(80)])) - print('Building Release candidate') - input('Press Enter to continue...') - print(' Checking github issues') - repository = get_github_repository(artifact_id) - check_opened_issues(release_version, repository, artifact_id) - if not dry_run: - print(' Running maven builds now and publish to sonatype - run-tests [%s]' % run_tests) - else: - print(' Running maven builds now run-tests [%s]' % run_tests) - build_release(run_tests=run_tests, dry_run=dry_run) - artifact = get_artifacts(artifact_id, release_version) - artifact_and_checksums = generate_checksums(artifact) - print(''.join(['-' for _ in range(80)])) - - print('Finish Release -- dry_run: %s' % dry_run) - input('Press Enter to continue...') - print(' merge release branch') - git_merge(src_branch, release_version) - print(' tag') - tag_release(release_version) - - add_maven_snapshot(POM_FILE, release_version, snapshot_version) - add_version_snapshot(README_FILE, release_version, snapshot_version) - add_documentation_snapshot(README_FILE, project_url, release_version, snapshot_version, src_branch) - add_pending_files(*pending_files) - commit_snapshot() - - print(' push to %s %s -- dry_run: %s' % (remote, src_branch, 
dry_run)) - git_push(remote, src_branch, release_version, dry_run) - print(' publish artifacts to S3 -- dry_run: %s' % dry_run) - publish_artifacts(artifact_and_checksums, base='elasticsearch/%s' % (artifact_id) , dry_run=dry_run) - print(' preparing email (from github issues)') - msg = prepare_email(artifact_id, release_version, repository, artifact_name, artifact_description, project_url) - print(' sending email -- dry_run: %s, mail: %s' % (dry_run, mail)) - send_email(msg, dry_run=dry_run, mail=mail) - - pending_msg = """ -Release successful pending steps: - * close and release sonatype repo: https://oss.sonatype.org/ - * check if the release is there https://oss.sonatype.org/content/repositories/releases/org/elasticsearch/%(artifact_id)s/%(version)s - * tweet about the release -""" - print(pending_msg % {'version': release_version, - 'artifact_id': artifact_id, - 'project_url': project_url}) - success = True - finally: - if not success: - run('git reset --hard HEAD') - run('git checkout %s' % src_branch) - elif dry_run: - print('End of dry_run') - input('Press Enter to reset changes...') - - run('git reset --hard %s' % head_hash) - run('git tag -d v%s' % release_version) - # we delete this one anyways - run('git branch -D %s' % (release_branch(release_version))) diff --git a/dev-tools/release.py b/dev-tools/release.py new file mode 100644 index 00000000000..edcc637d068 --- /dev/null +++ b/dev-tools/release.py @@ -0,0 +1,134 @@ +# Licensed to Elasticsearch under one or more contributor +# license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright +# ownership. Elasticsearch licenses this file to you under +# the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on +# an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +# either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +import datetime +import os +import shutil +import sys +import time +import urllib +import urllib.request +import zipfile + +from os.path import dirname, abspath + +""" + This tool builds a release from the a given elasticsearch plugin branch. + + It is basically a wrapper on top of launch_release.py which: + + - tries to get a more recent version of launch_release.py in ... + - download it if needed + - launch it passing all arguments to it, like: + + $ python3 dev_tools/release.py --branch master --publish --remote origin + + Important options: + + # Dry run + $ python3 dev_tools/release.py + + # Dry run without tests + python3 dev_tools/release.py --skiptests + + # Release, publish artifacts and announce + $ python3 dev_tools/release.py --publish + + See full documentation in launch_release.py +""" +env = os.environ + +# Change this if the source repository for your scripts is at a different location +SOURCE_REPO = 'elasticsearch/elasticsearch-plugins-script' +# We define that we should download again the script after 1 days +SCRIPT_OBSOLETE_DAYS = 1 +# We ignore in master.zip file the following files +IGNORED_FILES = ['.gitignore', 'README.md'] + + +ROOT_DIR = abspath(os.path.join(abspath(dirname(__file__)), '../')) +TARGET_TOOLS_DIR = ROOT_DIR + '/plugin_tools' +DEV_TOOLS_DIR = ROOT_DIR + '/dev-tools' +BUILD_RELEASE_FILENAME = 'release.zip' +BUILD_RELEASE_FILE = TARGET_TOOLS_DIR + '/' + BUILD_RELEASE_FILENAME +SOURCE_URL = 'https://github.com/%s/archive/master.zip' % SOURCE_REPO + +# Download a recent version of the release plugin tool +try: + 
os.mkdir(TARGET_TOOLS_DIR) + print('directory %s created' % TARGET_TOOLS_DIR) +except FileExistsError: + pass + + +try: + # we check latest update. If we ran an update recently, we + # are not going to check it again + download = True + + try: + last_download_time = datetime.datetime.fromtimestamp(os.path.getmtime(BUILD_RELEASE_FILE)) + if (datetime.datetime.now()-last_download_time).days < SCRIPT_OBSOLETE_DAYS: + download = False + except FileNotFoundError: + pass + + if download: + urllib.request.urlretrieve(SOURCE_URL, BUILD_RELEASE_FILE) + with zipfile.ZipFile(BUILD_RELEASE_FILE) as myzip: + for member in myzip.infolist(): + filename = os.path.basename(member.filename) + # skip directories + if not filename: + continue + if filename in IGNORED_FILES: + continue + + # copy file (taken from zipfile's extract) + source = myzip.open(member.filename) + target = open(os.path.join(TARGET_TOOLS_DIR, filename), "wb") + with source, target: + shutil.copyfileobj(source, target) + # We keep the original date + date_time = time.mktime(member.date_time + (0, 0, -1)) + os.utime(os.path.join(TARGET_TOOLS_DIR, filename), (date_time, date_time)) + print('plugin-tools updated from %s' % SOURCE_URL) +except urllib.error.HTTPError: + pass + + +# Let see if we need to update the release.py script itself +source_time = os.path.getmtime(TARGET_TOOLS_DIR + '/release.py') +repo_time = os.path.getmtime(DEV_TOOLS_DIR + '/release.py') +if source_time > repo_time: + input('release.py needs an update. 
Press a key to update it...') + shutil.copyfile(TARGET_TOOLS_DIR + '/release.py', DEV_TOOLS_DIR + '/release.py') + +# We can launch the build process +try: + PYTHON = 'python' + # make sure python3 is used if python3 is available + # some systems use python 2 as default + os.system('python3 --version > /dev/null 2>&1') + PYTHON = 'python3' +except RuntimeError: + pass + +release_args = '' +for x in range(1, len(sys.argv)): + release_args += ' ' + sys.argv[x] + +os.system('%s %s/build_release.py %s' % (PYTHON, TARGET_TOOLS_DIR, release_args)) diff --git a/dev-tools/upload-s3.py b/dev-tools/upload-s3.py deleted file mode 100644 index 95ea576e65c..00000000000 --- a/dev-tools/upload-s3.py +++ /dev/null @@ -1,67 +0,0 @@ -# Licensed to Elasticsearch under one or more contributor -# license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright -# ownership. Elasticsearch licenses this file to you under -# the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on -# an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, -# either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
- -import os -import sys -import argparse -try: - import boto.s3 -except: - raise RuntimeError(""" - S3 upload requires boto to be installed - Use one of: - 'pip install -U boto' - 'apt-get install python-boto' - 'easy_install boto' - """) - -import boto.s3 - - -def list_buckets(conn): - return conn.get_all_buckets() - - -def upload_s3(conn, path, key, file, bucket): - print 'Uploading %s to Amazon S3 bucket %s/%s' % \ - (file, bucket, os.path.join(path, key)) - def percent_cb(complete, total): - sys.stdout.write('.') - sys.stdout.flush() - bucket = conn.create_bucket(bucket) - k = bucket.new_key(os.path.join(path, key)) - k.set_contents_from_filename(file, cb=percent_cb, num_cb=100) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Uploads files to Amazon S3') - parser.add_argument('--file', '-f', metavar='path to file', - help='the branch to release from', required=True) - parser.add_argument('--bucket', '-b', metavar='B42', default='download.elasticsearch.org', - help='The S3 Bucket to upload to') - parser.add_argument('--path', '-p', metavar='elasticsearch/elasticsearch', default='elasticsearch/elasticsearch', - help='The key path to use') - parser.add_argument('--key', '-k', metavar='key', default=None, - help='The key - uses the file name as default key') - args = parser.parse_args() - if args.key: - key = args.key - else: - key = os.path.basename(args.file) - - connection = boto.connect_s3() - upload_s3(connection, args.path, key, args.file, args.bucket); - From aab6d69a78e51873a8c3c6e0427067026de6bd56 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Mon, 8 Sep 2014 23:20:27 +0200 Subject: [PATCH 086/131] Update to Lucene 4.10.0 Closes #33. 
--- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 6ddf5431919..43cbe0a70fb 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ 2.0.0-SNAPSHOT - 4.9.0 + 4.10.0 1 true onerror From 5ee9cca15b371c5c6102044f87c7f438e28e8e47 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Tue, 9 Sep 2014 00:04:33 +0200 Subject: [PATCH 087/131] Fix missing line --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index a5060ba7545..a502d852e37 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ In order to install the plugin, simply run: bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.3.0 ``` +You need to install a version matching your Elasticsearch version: | elasticsearch | ICU Analysis Plugin | Docs | |---------------|-----------------------|------------| From 710697bac09ccda68faeafb5a27b85395b8affdf Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 12 Sep 2014 16:02:43 +0200 Subject: [PATCH 088/131] Create branch es-1.4 for elasticsearch 1.4.0 --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a502d852e37..48daee10077 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ You need to install a version matching your Elasticsearch version: | elasticsearch | ICU Analysis Plugin | Docs | |---------------|-----------------------|------------| | master | Build from source | See below | -| es-1.x | Build from source | [2.4.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.x/#version-240-snapshot-for-elasticsearch-1x) | +| es-1.x | Build from source | [2.5.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.x/#version-250-snapshot-for-elasticsearch-1x) | +| es-1.4 | Build from source | [2.4.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.4/#version-240-snapshot-for-elasticsearch-14) | | es-1.3 | 2.3.0 | 
[2.3.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.3.0/#icu-analysis-for-elasticsearch) | | es-1.2 | 2.2.0 | [2.2.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.2.0/#icu-analysis-for-elasticsearch) | | es-1.1 | 2.1.0 | [2.1.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.1.0/#icu-analysis-for-elasticsearch) | From fa6e6b27175ae96ab5af21fa43856a47e7f7941a Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Wed, 24 Sep 2014 16:51:28 -0400 Subject: [PATCH 089/131] Upgrade to Lucene 4.10.1 snapshot --- pom.xml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 43cbe0a70fb..94c888db2bf 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,8 @@ 2.0.0-SNAPSHOT - 4.10.0 + 4.10.1 + 4.10.1-snapshot-1627368 1 true onerror @@ -42,6 +43,10 @@ + + Lucene snapshots + https://download.elasticsearch.org/lucenesnapshots/ + sonatype http://oss.sonatype.org/content/repositories/releases/ @@ -58,7 +63,7 @@ org.apache.lucene lucene-test-framework - ${lucene.version} + ${lucene.maven.version} test @@ -72,7 +77,7 @@ org.apache.lucene lucene-analyzers-icu - ${lucene.version} + ${lucene.maven.version} compile From f62f97e0f88fe0bdb68301b4be0aa23631829e76 Mon Sep 17 00:00:00 2001 From: mikemccand Date: Sun, 28 Sep 2014 17:56:10 -0400 Subject: [PATCH 090/131] Upgrade to Lucene 4.10.1 --- pom.xml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index 94c888db2bf..34b1bcbb7d4 100644 --- a/pom.xml +++ b/pom.xml @@ -34,7 +34,7 @@ 2.0.0-SNAPSHOT 4.10.1 - 4.10.1-snapshot-1627368 + 4.10.1 1 true onerror @@ -43,10 +43,6 @@ - - Lucene snapshots - https://download.elasticsearch.org/lucenesnapshots/ - sonatype http://oss.sonatype.org/content/repositories/releases/ From 481a268931fa859c274c1c5e800954584107df85 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 2 Oct 2014 09:51:37 +0200 Subject: [PATCH 091/131] Update to elasticsearch 
1.4.0.Beta1 (cherry picked from commit 9abbe92) (cherry picked from commit 890c483) --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 34b1bcbb7d4..44421befa1b 100644 --- a/pom.xml +++ b/pom.xml @@ -107,8 +107,8 @@ maven-compiler-plugin 2.3.2 - 1.6 - 1.6 + 1.7 + 1.7
From 77134b7c79d8dca34542f3a28733166f489878d6 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 2 Oct 2014 09:57:32 +0200 Subject: [PATCH 092/131] update documentation with release 2.4.0 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 48daee10077..3059b4ce1f8 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, simply run: ```sh -bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.3.0 +bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.4.0 ``` You need to install a version matching your Elasticsearch version: @@ -15,7 +15,7 @@ You need to install a version matching your Elasticsearch version: |---------------|-----------------------|------------| | master | Build from source | See below | | es-1.x | Build from source | [2.5.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.x/#version-250-snapshot-for-elasticsearch-1x) | -| es-1.4 | Build from source | [2.4.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.4/#version-240-snapshot-for-elasticsearch-14) | +| es-1.4 | 2.4.0 | [2.4.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.0/#version-240-for-elasticsearch-14) | | es-1.3 | 2.3.0 | [2.3.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.3.0/#icu-analysis-for-elasticsearch) | | es-1.2 | 2.2.0 | [2.2.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.2.0/#icu-analysis-for-elasticsearch) | | es-1.1 | 2.1.0 | [2.1.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.1.0/#icu-analysis-for-elasticsearch) | From e9e80407b389a1faa67f2dc18b6ceaf89d84642c Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 15 Oct 2014 17:17:31 +0200 Subject: [PATCH 093/131] Tests: index.version.created must be set Due to this 
[change](https://github.com/elasticsearch/elasticsearch/pull/8018), we need to fix our tests for elasticsearch 1.4.0 and above. Closes #41. (cherry picked from commit 75b800f) --- .../index/analysis/AnalysisTestUtils.java | 54 +++++++++++++++++++ .../analysis/SimpleIcuAnalysisTests.java | 27 +++------- .../SimpleIcuCollationTokenFilterTests.java | 40 ++++---------- .../SimpleIcuNormalizerCharFilterTests.java | 29 ++-------- 4 files changed, 74 insertions(+), 76 deletions(-) create mode 100644 src/test/java/org/elasticsearch/index/analysis/AnalysisTestUtils.java diff --git a/src/test/java/org/elasticsearch/index/analysis/AnalysisTestUtils.java b/src/test/java/org/elasticsearch/index/analysis/AnalysisTestUtils.java new file mode 100644 index 00000000000..dc619d22e2f --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/AnalysisTestUtils.java @@ -0,0 +1,54 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.inject.Injector; +import org.elasticsearch.common.inject.ModulesBuilder; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.settings.SettingsModule; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.EnvironmentModule; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexNameModule; +import org.elasticsearch.index.settings.IndexSettingsModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisService; + +import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; + +public class AnalysisTestUtils { + + public static AnalysisService createAnalysisService(Settings settings) { + Index index = new Index("test"); + Settings indexSettings = settingsBuilder().put(settings) + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .build(); + Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector(); + Injector injector = new ModulesBuilder().add( + new IndexSettingsModule(index, indexSettings), + new IndexNameModule(index), + new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor())) + .createChildInjector(parentInjector); + + return injector.getInstance(AnalysisService.class); + } +} diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java index e12db59c6a8..8408a3231e0 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java +++ 
b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java @@ -19,20 +19,12 @@ package org.elasticsearch.index.analysis; -import org.elasticsearch.common.inject.Injector; -import org.elasticsearch.common.inject.ModulesBuilder; -import org.elasticsearch.common.settings.SettingsModule; -import org.elasticsearch.env.Environment; -import org.elasticsearch.env.EnvironmentModule; -import org.elasticsearch.index.Index; -import org.elasticsearch.index.IndexNameModule; -import org.elasticsearch.index.settings.IndexSettingsModule; -import org.elasticsearch.indices.analysis.IndicesAnalysisModule; -import org.elasticsearch.indices.analysis.IndicesAnalysisService; +import org.elasticsearch.common.settings.Settings; import org.elasticsearch.test.ElasticsearchTestCase; import org.junit.Test; -import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS; +import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; +import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService; import static org.hamcrest.Matchers.instanceOf; /** */ @@ -40,16 +32,9 @@ public class SimpleIcuAnalysisTests extends ElasticsearchTestCase { @Test public void testDefaultsIcuAnalysis() { - Index index = new Index("test"); - - Injector parentInjector = new ModulesBuilder().add(new SettingsModule(EMPTY_SETTINGS), new EnvironmentModule(new Environment(EMPTY_SETTINGS)), new IndicesAnalysisModule()).createInjector(); - Injector injector = new ModulesBuilder().add( - new IndexSettingsModule(index, EMPTY_SETTINGS), - new IndexNameModule(index), - new AnalysisModule(EMPTY_SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor())) - .createChildInjector(parentInjector); - - AnalysisService analysisService = injector.getInstance(AnalysisService.class); + Settings settings = settingsBuilder() + 
.loadFromClasspath("org/elasticsearch/index/analysis/phonetic-1.yml").build(); + AnalysisService analysisService = createAnalysisService(settings); TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer"); assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class)); diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java index 39440e1e723..5098a791f66 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java @@ -25,24 +25,16 @@ import com.ibm.icu.util.ULocale; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.elasticsearch.common.inject.Injector; -import org.elasticsearch.common.inject.ModulesBuilder; import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.settings.SettingsModule; -import org.elasticsearch.env.Environment; -import org.elasticsearch.env.EnvironmentModule; import org.elasticsearch.index.Index; -import org.elasticsearch.index.IndexNameModule; -import org.elasticsearch.index.settings.IndexSettingsModule; -import org.elasticsearch.indices.analysis.IndicesAnalysisModule; -import org.elasticsearch.indices.analysis.IndicesAnalysisService; import org.elasticsearch.test.ElasticsearchTestCase; import org.junit.Test; import java.io.IOException; import java.io.StringReader; +import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService; import static org.hamcrest.Matchers.equalTo; // Tests borrowed from Solr's Icu collation key filter factory test. 
@@ -62,7 +54,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { .put("index.analysis.filter.myCollator.language", "tr") .put("index.analysis.filter.myCollator.strength", "primary") .build(); - AnalysisService analysisService = createAnalysisService(index, settings); + AnalysisService analysisService = createAnalysisService(settings); String turkishUpperCase = "I WİLL USE TURKİSH CASING"; String turkishLowerCase = "ı will use turkish casıng"; @@ -85,7 +77,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { .put("index.analysis.filter.myCollator.strength", "primary") .put("index.analysis.filter.myCollator.decomposition", "canonical") .build(); - AnalysisService analysisService = createAnalysisService(index, settings); + AnalysisService analysisService = createAnalysisService(settings); String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING"; String turkishLowerCase = "ı will use turkish casıng"; @@ -108,7 +100,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { .put("index.analysis.filter.myCollator.strength", "secondary") .put("index.analysis.filter.myCollator.decomposition", "no") .build(); - AnalysisService analysisService = createAnalysisService(index, settings); + AnalysisService analysisService = createAnalysisService(settings); String upperCase = "TESTING"; String lowerCase = "testing"; @@ -132,7 +124,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { .put("index.analysis.filter.myCollator.strength", "primary") .put("index.analysis.filter.myCollator.alternate", "shifted") .build(); - AnalysisService analysisService = createAnalysisService(index, settings); + AnalysisService analysisService = createAnalysisService(settings); String withPunctuation = "foo-bar"; String withoutPunctuation = "foo bar"; @@ -157,7 +149,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { 
.put("index.analysis.filter.myCollator.alternate", "shifted") .put("index.analysis.filter.myCollator.variableTop", " ") .build(); - AnalysisService analysisService = createAnalysisService(index, settings); + AnalysisService analysisService = createAnalysisService(settings); String withSpace = "foo bar"; String withoutSpace = "foobar"; @@ -185,7 +177,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { .put("index.analysis.filter.myCollator.language", "en") .put("index.analysis.filter.myCollator.numeric", "true") .build(); - AnalysisService analysisService = createAnalysisService(index, settings); + AnalysisService analysisService = createAnalysisService(settings); String nine = "foobar-9"; String ten = "foobar-10"; @@ -209,7 +201,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { .put("index.analysis.filter.myCollator.strength", "primary") .put("index.analysis.filter.myCollator.caseLevel", "true") .build(); - AnalysisService analysisService = createAnalysisService(index, settings); + AnalysisService analysisService = createAnalysisService(settings); String withAccents = "résumé"; String withoutAccents = "resume"; @@ -244,7 +236,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { .put("index.analysis.filter.myCollator.strength", "tertiary") .put("index.analysis.filter.myCollator.caseFirst", "upper") .build(); - AnalysisService analysisService = createAnalysisService(index, settings); + AnalysisService analysisService = createAnalysisService(settings); String lower = "resume"; String upper = "Resume"; @@ -273,13 +265,12 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings); String tailoredRules = tailoredCollator.getRules(); - Index index = new Index("test"); Settings settings = ImmutableSettings.settingsBuilder() 
.put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.rules", tailoredRules) .put("index.analysis.filter.myCollator.strength", "primary") .build(); - AnalysisService analysisService = createAnalysisService(index, settings); + AnalysisService analysisService = createAnalysisService(settings); String germanUmlaut = "Töne"; String germanOE = "Toene"; @@ -289,17 +280,6 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { assertCollatesToSame(tsUmlaut, tsOE); } - private AnalysisService createAnalysisService(Index index, Settings settings) { - Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector(); - Injector injector = new ModulesBuilder().add( - new IndexSettingsModule(index, settings), - new IndexNameModule(index), - new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor())) - .createChildInjector(parentInjector); - - return injector.getInstance(AnalysisService.class); - } - private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException { assertCollation(stream1, stream2, 0); } diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java index c4cbb945e4c..b2e6e3dfb49 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java @@ -21,23 +21,15 @@ package org.elasticsearch.index.analysis; import com.ibm.icu.text.Normalizer2; import org.apache.lucene.analysis.CharFilter; -import org.elasticsearch.common.inject.Injector; -import org.elasticsearch.common.inject.ModulesBuilder; import 
org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.settings.SettingsModule; -import org.elasticsearch.env.Environment; -import org.elasticsearch.env.EnvironmentModule; -import org.elasticsearch.index.Index; -import org.elasticsearch.index.IndexNameModule; -import org.elasticsearch.index.settings.IndexSettingsModule; -import org.elasticsearch.indices.analysis.IndicesAnalysisModule; -import org.elasticsearch.indices.analysis.IndicesAnalysisService; import org.elasticsearch.test.ElasticsearchTestCase; import org.junit.Test; import java.io.StringReader; +import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService; + /** * Test */ @@ -46,11 +38,10 @@ public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase { @Test public void testDefaultSetting() throws Exception { - Index index = new Index("test"); Settings settings = ImmutableSettings.settingsBuilder() .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer") .build(); - AnalysisService analysisService = createAnalysisService(index, settings); + AnalysisService analysisService = createAnalysisService(settings); CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar"); String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि"; @@ -72,13 +63,12 @@ public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase { @Test public void testNameAndModeSetting() throws Exception { - Index index = new Index("test"); Settings settings = ImmutableSettings.settingsBuilder() .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer") .put("index.analysis.char_filter.myNormalizerChar.name", "nfkc") .put("index.analysis.char_filter.myNormalizerChar.mode", "decompose") .build(); - AnalysisService analysisService = createAnalysisService(index, settings); + AnalysisService analysisService = 
createAnalysisService(settings); CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar"); String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि"; @@ -95,15 +85,4 @@ public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase { } assertEquals(expectedOutput, output.toString()); } - - private AnalysisService createAnalysisService(Index index, Settings settings) { - Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector(); - Injector injector = new ModulesBuilder().add( - new IndexSettingsModule(index, settings), - new IndexNameModule(index), - new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor())) - .createChildInjector(parentInjector); - - return injector.getInstance(AnalysisService.class); - } } From eb7a8a427c91bab057d5d2473cc9bf206120b68b Mon Sep 17 00:00:00 2001 From: Jun Ohtani Date: Thu, 30 Oct 2014 14:29:24 +0900 Subject: [PATCH 094/131] Tests: Fix randomizedtest fail Closes #43 --- pom.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pom.xml b/pom.xml index 44421befa1b..f51bad4891f 100644 --- a/pom.xml +++ b/pom.xml @@ -56,6 +56,12 @@ 1.3 test + + com.carrotsearch.randomizedtesting + randomizedtesting-runner + 2.1.10 + test + org.apache.lucene lucene-test-framework From cec157dafa81f89d02ce7df9143467d03ba9b577 Mon Sep 17 00:00:00 2001 From: Jun Ohtani Date: Thu, 30 Oct 2014 14:30:28 +0900 Subject: [PATCH 095/131] Update to Lucene 4.10.2 Closes #44 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index f51bad4891f..055c3882a58 100644 --- a/pom.xml +++ b/pom.xml @@ -33,8 +33,8 @@ 2.0.0-SNAPSHOT - 4.10.1 - 4.10.1 + 4.10.2 + 4.10.2 1 true onerror From 472c21a138584b704061571341bae0503e657c76 Mon Sep 17 00:00:00 2001 From: tlrx Date: Wed, 5 
Nov 2014 17:30:57 +0100 Subject: [PATCH 096/131] update documentation with release 2.4.1 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3059b4ce1f8..491053e4b9c 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, simply run: ```sh -bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.4.0 +bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.4.1 ``` You need to install a version matching your Elasticsearch version: @@ -15,7 +15,7 @@ You need to install a version matching your Elasticsearch version: |---------------|-----------------------|------------| | master | Build from source | See below | | es-1.x | Build from source | [2.5.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.x/#version-250-snapshot-for-elasticsearch-1x) | -| es-1.4 | 2.4.0 | [2.4.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.0/#version-240-for-elasticsearch-14) | +| es-1.4 | 2.4.1 | [2.4.1](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.1/#version-241-for-elasticsearch-14) | | es-1.3 | 2.3.0 | [2.3.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.3.0/#icu-analysis-for-elasticsearch) | | es-1.2 | 2.2.0 | [2.2.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.2.0/#icu-analysis-for-elasticsearch) | | es-1.1 | 2.1.0 | [2.1.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.1.0/#icu-analysis-for-elasticsearch) | From c2c034583717e122fa25ae91cc954638203ead22 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 5 Nov 2014 16:25:33 -0500 Subject: [PATCH 097/131] upgrade to lucene 5 snapshot (will open issue about collators) --- README.md | 83 +---- pom.xml | 8 +- .../IcuCollationTokenFilterFactory.java | 4 +- .../index/analysis/IcuTokenizerFactory.java | 4 +- 
.../indices/analysis/IcuIndicesAnalysis.java | 20 +- .../index/analysis/ICUIntegrationTests.java | 6 +- .../SimpleIcuCollationTokenFilterTests.java | 303 ------------------ 7 files changed, 18 insertions(+), 410 deletions(-) delete mode 100644 src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java diff --git a/README.md b/README.md index 491053e4b9c..40aab5a20ce 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ Normalizes characters as explained [here](http://userguide.icu-project.org/trans "index" : { "analysis" : { "analyzer" : { - "collation" : { + "normalized" : { "tokenizer" : "keyword", "filter" : ["icu_normalizer"] } @@ -61,7 +61,7 @@ Folding of unicode characters based on `UTR#30`. It registers itself under `icu_ "index" : { "analysis" : { "analyzer" : { - "collation" : { + "folded" : { "tokenizer" : "keyword", "filter" : ["icu_folding"] } @@ -101,81 +101,6 @@ The Following example exempts Swedish characters from the folding. Note that the } ``` -ICU Collation -------------- - -Uses collation token filter. Allows to either specify the rules for collation -(defined [here](http://www.icu-project.org/userguide/Collate_Customization.html)) using the `rules` parameter -(can point to a location or expressed in the settings, location can be relative to config location), or using the -`language` parameter (further specialized by country and variant). By default registers under `icu_collation` or -`icuCollation` and uses the default locale. 
- -Here is a sample settings: - -```js -{ - "index" : { - "analysis" : { - "analyzer" : { - "collation" : { - "tokenizer" : "keyword", - "filter" : ["icu_collation"] - } - } - } - } -} -``` - -And here is a sample of custom collation: - -```js -{ - "index" : { - "analysis" : { - "analyzer" : { - "collation" : { - "tokenizer" : "keyword", - "filter" : ["myCollator"] - } - }, - "filter" : { - "myCollator" : { - "type" : "icu_collation", - "language" : "en" - } - } - } - } -} -``` - -Optional options: -* `strength` - The strength property determines the minimum level of difference considered significant during comparison. - The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator. - Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`. - See [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) documentation for a more detailed - explanation for the specific values. -* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property with -`canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were -normalized. If `no` is set, it is the user's responsibility to insure that all text is already in the appropriate form -before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between -faster and more complete collation behavior. Since a great many of the world's languages do not require text -normalization, most locales set `no` as the default decomposition mode. - -Expert options: -* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary` - to be either shifted or non-ignorable. What boils down to ignoring punctuation and whitespace. -* `caseLevel` - Possible values: `true` or `false`. Default is `false`. 
Whether case level sorting is required. When - strength is set to `primary` this will ignore accent differences. -* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored - for strength `tertiary`. -* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to numeric representation. For - example the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`. -* `variableTop` - Single character or contraction. Controls what is variable for `alternate`. -* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Distinguishing between Katakana - and Hiragana characters in `quaternary` strength . - ICU Tokenizer ------------- @@ -186,7 +111,7 @@ Breaks text into words according to [UAX #29: Unicode Text Segmentation](http:// "index" : { "analysis" : { "analyzer" : { - "collation" : { + "tokenized" : { "tokenizer" : "icu_tokenizer", } } @@ -211,7 +136,7 @@ Here is a sample settings: "index" : { "analysis" : { "analyzer" : { - "collation" : { + "normalized" : { "tokenizer" : "keyword", "char_filter" : ["icu_normalizer"] } diff --git a/pom.xml b/pom.xml index 055c3882a58..938ddef943e 100644 --- a/pom.xml +++ b/pom.xml @@ -33,8 +33,8 @@ 2.0.0-SNAPSHOT - 4.10.2 - 4.10.2 + 5.0.0 + 5.0.0-snapshot-1636426 1 true onerror @@ -47,6 +47,10 @@ sonatype http://oss.sonatype.org/content/repositories/releases/ + + Lucene snapshots + https://download.elasticsearch.org/lucenesnapshots/maven/ + diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java index 0e2a9799daf..0e2bc7a13bb 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java @@ -23,7 +23,6 @@ import com.ibm.icu.text.Collator; import 
com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.util.ULocale; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.collation.ICUCollationKeyFilter; import org.elasticsearch.ElasticsearchIllegalArgumentException; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; @@ -174,6 +173,7 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { @Override public TokenStream create(TokenStream tokenStream) { - return new ICUCollationKeyFilter(tokenStream, collator); + throw new UnsupportedOperationException("i was deprecated in lucene 4, and now i'm gone"); + // TODO: lucene does sort keys as binary keys since 4.x } } diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java index 168e85f8d2a..fe20d93069e 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java @@ -39,8 +39,8 @@ public class IcuTokenizerFactory extends AbstractTokenizerFactory { } @Override - public Tokenizer create(Reader reader) { - return new ICUTokenizer(reader); + public Tokenizer create() { + return new ICUTokenizer(); } } diff --git a/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java b/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java index eb0edf76185..a4330efd0d2 100644 --- a/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java +++ b/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java @@ -19,7 +19,6 @@ package org.elasticsearch.indices.analysis; -import com.ibm.icu.text.Collator; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.Transliterator; import org.apache.lucene.analysis.TokenStream; @@ -27,7 +26,6 @@ import org.apache.lucene.analysis.Tokenizer; import 
org.apache.lucene.analysis.icu.ICUFoldingFilter; import org.apache.lucene.analysis.icu.ICUTransformFilter; import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; -import org.apache.lucene.collation.ICUCollationKeyFilter; import org.elasticsearch.common.component.AbstractComponent; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.settings.Settings; @@ -36,8 +34,6 @@ import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.analysis.TokenizerFactory; -import java.io.Reader; - /** * Registers indices level analysis components so, if not explicitly configured, will be shared * among all indices. @@ -55,8 +51,8 @@ public class IcuIndicesAnalysis extends AbstractComponent { } @Override - public Tokenizer create(Reader reader) { - return new ICUTokenizer(reader); + public Tokenizer create() { + return new ICUTokenizer(); } })); @@ -85,18 +81,6 @@ public class IcuIndicesAnalysis extends AbstractComponent { } })); - indicesAnalysisService.tokenFilterFactories().put("icu_collation", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "icu_collation"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new ICUCollationKeyFilter(tokenStream, Collator.getInstance()); - } - })); - indicesAnalysisService.tokenFilterFactories().put("icu_transform", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { @Override public String name() { diff --git a/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java b/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java index 95874c98b07..d8a13b0febe 100644 --- a/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java @@ -52,10 +52,8 @@ public class ICUIntegrationTests extends 
ElasticsearchIntegrationTest { Settings settings = ImmutableSettings.builder() .put(super.indexSettings()) .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard") - .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_collator") - .put("index.analysis.filter.my_collator.type", "icu_collation") - .put("index.analysis.filter.my_collator.language", "en") - .put("index.analysis.filter.my_collator.strength", "primary") + .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "my_folding") + .put("index.analysis.filter.my_folding.type", "icu_folding") .build(); return settings; diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java deleted file mode 100644 index 5098a791f66..00000000000 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java +++ /dev/null @@ -1,303 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.elasticsearch.index.analysis; - -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.util.ULocale; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.elasticsearch.common.settings.ImmutableSettings; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.Index; -import org.elasticsearch.test.ElasticsearchTestCase; -import org.junit.Test; - -import java.io.IOException; -import java.io.StringReader; - -import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService; -import static org.hamcrest.Matchers.equalTo; - -// Tests borrowed from Solr's Icu collation key filter factory test. -public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { - - /* - * Turkish has some funny casing. - * This test shows how you can solve this kind of thing easily with collation. - * Instead of using LowerCaseFilter, use a turkish collator with primary strength. - * Then things will sort and match correctly. 
- */ - @Test - public void testBasicUsage() throws Exception { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "tr") - .put("index.analysis.filter.myCollator.strength", "primary") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String turkishUpperCase = "I WİLL USE TURKİSH CASING"; - String turkishLowerCase = "ı will use turkish casıng"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase))); - TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase))); - assertCollatesToSame(tsUpper, tsLower); - } - - /* - * Test usage of the decomposition option for unicode normalization. - */ - @Test - public void testNormalization() throws IOException { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "tr") - .put("index.analysis.filter.myCollator.strength", "primary") - .put("index.analysis.filter.myCollator.decomposition", "canonical") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING"; - String turkishLowerCase = "ı will use turkish casıng"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase))); - TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase))); - assertCollatesToSame(tsUpper, tsLower); - } - - /* - * Test secondary strength, for english case is not significant. 
- */ - @Test - public void testSecondaryStrength() throws IOException { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "en") - .put("index.analysis.filter.myCollator.strength", "secondary") - .put("index.analysis.filter.myCollator.decomposition", "no") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String upperCase = "TESTING"; - String lowerCase = "testing"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upperCase))); - TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lowerCase))); - assertCollatesToSame(tsUpper, tsLower); - } - - /* - * Setting alternate=shifted to shift whitespace, punctuation and symbols - * to quaternary level - */ - @Test - public void testIgnorePunctuation() throws IOException { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "en") - .put("index.analysis.filter.myCollator.strength", "primary") - .put("index.analysis.filter.myCollator.alternate", "shifted") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String withPunctuation = "foo-bar"; - String withoutPunctuation = "foo bar"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation))); - TokenStream tsWithoutPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withoutPunctuation))); - assertCollatesToSame(tsPunctuation, tsWithoutPunctuation); - } - - /* - * Setting alternate=shifted and variableTop to 
shift whitespace, but not - * punctuation or symbols, to quaternary level - */ - @Test - public void testIgnoreWhitespace() throws IOException { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "en") - .put("index.analysis.filter.myCollator.strength", "primary") - .put("index.analysis.filter.myCollator.alternate", "shifted") - .put("index.analysis.filter.myCollator.variableTop", " ") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String withSpace = "foo bar"; - String withoutSpace = "foobar"; - String withPunctuation = "foo-bar"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace))); - TokenStream tsWithoutSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withoutSpace))); - assertCollatesToSame(tsWithSpace, tsWithoutSpace); - // now assert that punctuation still matters: foo-bar < foo bar - tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace))); - TokenStream tsWithPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation))); - assertCollation(tsWithPunctuation, tsWithSpace, -1); - } - - /* - * Setting numeric to encode digits with numeric value, so that - * foobar-9 sorts before foobar-10 - */ - @Test - public void testNumerics() throws IOException { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "en") - .put("index.analysis.filter.myCollator.numeric", "true") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String nine = "foobar-9"; - String ten = "foobar-10"; - 
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsNine = filterFactory.create(new KeywordTokenizer(new StringReader(nine))); - TokenStream tsTen = filterFactory.create(new KeywordTokenizer(new StringReader(ten))); - assertCollation(tsNine, tsTen, -1); - } - - /* - * Setting caseLevel=true to create an additional case level between - * secondary and tertiary - */ - @Test - public void testIgnoreAccentsButNotCase() throws IOException { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "en") - .put("index.analysis.filter.myCollator.strength", "primary") - .put("index.analysis.filter.myCollator.caseLevel", "true") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String withAccents = "résumé"; - String withoutAccents = "resume"; - String withAccentsUpperCase = "Résumé"; - String withoutAccentsUpperCase = "Resume"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsWithAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withAccents))); - TokenStream tsWithoutAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents))); - assertCollatesToSame(tsWithAccents, tsWithoutAccents); - - TokenStream tsWithAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withAccentsUpperCase))); - TokenStream tsWithoutAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase))); - assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase); - - // now assert that case still matters: resume < Resume - TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents))); - TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new 
StringReader(withoutAccentsUpperCase))); - assertCollation(tsLower, tsUpper, -1); - } - - /* - * Setting caseFirst=upper to cause uppercase strings to sort - * before lowercase ones. - */ - @Test - public void testUpperCaseFirst() throws IOException { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "en") - .put("index.analysis.filter.myCollator.strength", "tertiary") - .put("index.analysis.filter.myCollator.caseFirst", "upper") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String lower = "resume"; - String upper = "Resume"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lower))); - TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upper))); - assertCollation(tsUpper, tsLower, -1); - } - - /* - * For german, you might want oe to sort and match with o umlaut. - * This is not the default, but you can make a customized ruleset to do this. - * - * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior. 
- * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383 - */ - @Test - public void testCustomRules() throws Exception { - RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE")); - String DIN5007_2_tailorings = - "& ae , a\u0308 & AE , A\u0308"+ - "& oe , o\u0308 & OE , O\u0308"+ - "& ue , u\u0308 & UE , u\u0308"; - - RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings); - String tailoredRules = tailoredCollator.getRules(); - - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.rules", tailoredRules) - .put("index.analysis.filter.myCollator.strength", "primary") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String germanUmlaut = "Töne"; - String germanOE = "Toene"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - TokenStream tsUmlaut = filterFactory.create(new KeywordTokenizer(new StringReader(germanUmlaut))); - TokenStream tsOE = filterFactory.create(new KeywordTokenizer(new StringReader(germanOE))); - assertCollatesToSame(tsUmlaut, tsOE); - } - - private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException { - assertCollation(stream1, stream2, 0); - } - - private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException { - CharTermAttribute term1 = stream1 - .addAttribute(CharTermAttribute.class); - CharTermAttribute term2 = stream2 - .addAttribute(CharTermAttribute.class); - - stream1.reset(); - stream2.reset(); - - assertThat(stream1.incrementToken(), equalTo(true)); - assertThat(stream2.incrementToken(), equalTo(true)); - assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison))); - assertThat(stream1.incrementToken(), equalTo(false)); - 
assertThat(stream2.incrementToken(), equalTo(false)); - } - -} From e45308d9e7f31325143b97a92a6c42167049bed0 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 6 Nov 2014 03:11:20 -0500 Subject: [PATCH 098/131] add back collation (still the way it was working before) --- README.md | 75 ++++++ .../index/analysis/ICUCollationKeyFilter.java | 109 ++++++++ .../IcuCollationTokenFilterFactory.java | 3 +- .../analysis/IndexableBinaryStringTools.java | 241 +++++++++++++++++ .../indices/analysis/IcuIndicesAnalysis.java | 14 + .../index/analysis/ICUIntegrationTests.java | 6 +- .../SimpleIcuCollationTokenFilterTests.java | 255 ++++++++++++++++++ .../TestIndexableBinaryStringTools.java | 251 +++++++++++++++++ 8 files changed, 950 insertions(+), 4 deletions(-) create mode 100644 src/main/java/org/elasticsearch/index/analysis/ICUCollationKeyFilter.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/IndexableBinaryStringTools.java create mode 100644 src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java create mode 100644 src/test/java/org/elasticsearch/index/analysis/TestIndexableBinaryStringTools.java diff --git a/README.md b/README.md index 40aab5a20ce..afa9150215e 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,81 @@ The Following example exempts Swedish characters from the folding. Note that the } ``` +ICU Collation +------------- + +Uses collation token filter. Allows to either specify the rules for collation +(defined [here](http://www.icu-project.org/userguide/Collate_Customization.html)) using the `rules` parameter +(can point to a location or expressed in the settings, location can be relative to config location), or using the +`language` parameter (further specialized by country and variant). By default registers under `icu_collation` or +`icuCollation` and uses the default locale. 
+ +Here is a sample settings: + +```js +{ + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "keyword", + "filter" : ["icu_collation"] + } + } + } + } +} +``` + +And here is a sample of custom collation: + +```js +{ + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "keyword", + "filter" : ["myCollator"] + } + }, + "filter" : { + "myCollator" : { + "type" : "icu_collation", + "language" : "en" + } + } + } + } +} +``` + +Optional options: +* `strength` - The strength property determines the minimum level of difference considered significant during comparison. + The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator. + Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`. + See [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) documentation for a more detailed + explanation for the specific values. +* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property with +`canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were +normalized. If `no` is set, it is the user's responsibility to insure that all text is already in the appropriate form +before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between +faster and more complete collation behavior. Since a great many of the world's languages do not require text +normalization, most locales set `no` as the default decomposition mode. + +Expert options: +* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary` + to be either shifted or non-ignorable. What boils down to ignoring punctuation and whitespace. +* `caseLevel` - Possible values: `true` or `false`. Default is `false`. 
Whether case level sorting is required. When + strength is set to `primary` this will ignore accent differences. +* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored + for strength `tertiary`. +* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to numeric representation. For + example the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`. +* `variableTop` - Single character or contraction. Controls what is variable for `alternate`. +* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Distinguishing between Katakana + and Hiragana characters in `quaternary` strength . + ICU Tokenizer ------------- diff --git a/src/main/java/org/elasticsearch/index/analysis/ICUCollationKeyFilter.java b/src/main/java/org/elasticsearch/index/analysis/ICUCollationKeyFilter.java new file mode 100644 index 00000000000..d55be9203e0 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/ICUCollationKeyFilter.java @@ -0,0 +1,109 @@ +package org.elasticsearch.index.analysis; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RawCollationKey; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +import java.io.IOException; + +/** + *
<p>
+ * Converts each token into its {@link com.ibm.icu.text.CollationKey}, and + * then encodes the CollationKey with {@link IndexableBinaryStringTools}, to + * allow it to be stored as an index term. + *
</p>
+ *
<p>
+ * WARNING: Make sure you use exactly the same Collator at + * index and query time -- CollationKeys are only comparable when produced by + * the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are + * independently versioned, so it is safe to search against stored + * CollationKeys if the following are exactly the same (best practice is + * to store this information with the index and check that they remain the + * same at query time): + *
</p>
+ *
    + *
  1. + * Collator version - see {@link Collator#getVersion()} + *
  2. + *
  3. + * The collation strength used - see {@link Collator#setStrength(int)} + *
  4. + *
+ *

+ * CollationKeys generated by ICU Collators are not compatible with those + * generated by java.text.Collators. Specifically, if you use + * ICUCollationKeyFilter to generate index terms, do not use + * {@code CollationKeyFilter} on the query side, or vice versa. + *
</p>
+ *
<p>
+ * ICUCollationKeyFilter is significantly faster and generates significantly + * shorter keys than CollationKeyFilter. See + * http://site.icu-project.org/charts/collation-icu4j-sun for key + * generation timing and key length comparisons between ICU4J and + * java.text.Collator over several languages. + *
</p>
+ * @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes + * terms directly as bytes. This filter WAS removed in Lucene 5.0 + */ +@Deprecated +public final class ICUCollationKeyFilter extends TokenFilter { + private Collator collator = null; + private RawCollationKey reusableKey = new RawCollationKey(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + /** + * + * @param input Source token stream + * @param collator CollationKey generator + */ + public ICUCollationKeyFilter(TokenStream input, Collator collator) { + super(input); + // clone the collator: see http://userguide.icu-project.org/collation/architecture + try { + this.collator = (Collator) collator.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] termBuffer = termAtt.buffer(); + String termText = new String(termBuffer, 0, termAtt.length()); + collator.getRawCollationKey(termText, reusableKey); + int encodedLength = IndexableBinaryStringTools.getEncodedLength( + reusableKey.bytes, 0, reusableKey.size); + if (encodedLength > termBuffer.length) { + termAtt.resizeBuffer(encodedLength); + } + termAtt.setLength(encodedLength); + IndexableBinaryStringTools.encode(reusableKey.bytes, 0, reusableKey.size, + termAtt.buffer(), 0, encodedLength); + return true; + } else { + return false; + } + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java index 0e2bc7a13bb..489deafe313 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java @@ -173,7 +173,6 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { @Override public 
TokenStream create(TokenStream tokenStream) { - throw new UnsupportedOperationException("i was deprecated in lucene 4, and now i'm gone"); - // TODO: lucene does sort keys as binary keys since 4.x + return new ICUCollationKeyFilter(tokenStream, collator); } } diff --git a/src/main/java/org/elasticsearch/index/analysis/IndexableBinaryStringTools.java b/src/main/java/org/elasticsearch/index/analysis/IndexableBinaryStringTools.java new file mode 100644 index 00000000000..b8ae222e39a --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IndexableBinaryStringTools.java @@ -0,0 +1,241 @@ +package org.elasticsearch.index.analysis; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadoc + +/** + * Provides support for converting byte sequences to Strings and back again. + * The resulting Strings preserve the original byte sequences' sort order. + *
<p>
+ * The Strings are constructed using a Base 8000h encoding of the original + * binary data - each char of an encoded String represents a 15-bit chunk + * from the byte sequence. Base 8000h was chosen because it allows for all + * lower 15 bits of char to be used without restriction; the surrogate range + * [U+D8000-U+DFFF] does not represent valid chars, and would require + * complicated handling to avoid them and allow use of char's high bit. + *
<p>
+ * Although unset bits are used as padding in the final char, the original + * byte sequence could contain trailing bytes with no set bits (null bytes): + * padding is indistinguishable from valid information. To overcome this + * problem, a char is appended, indicating the number of encoded bytes in the + * final content char. + *
<p>
+ * + * @lucene.experimental + * @deprecated Implement {@link TermToBytesRefAttribute} and store bytes directly + * instead. This class WAS removed in Lucene 5.0 + */ +@Deprecated +public final class IndexableBinaryStringTools { + + private static final CodingCase[] CODING_CASES = { + // CodingCase(int initialShift, int finalShift) + new CodingCase( 7, 1 ), + // CodingCase(int initialShift, int middleShift, int finalShift) + new CodingCase(14, 6, 2), + new CodingCase(13, 5, 3), + new CodingCase(12, 4, 4), + new CodingCase(11, 3, 5), + new CodingCase(10, 2, 6), + new CodingCase( 9, 1, 7), + new CodingCase( 8, 0 ) + }; + + // Export only static methods + private IndexableBinaryStringTools() {} + + /** + * Returns the number of chars required to encode the given bytes. + * + * @param inputArray byte sequence to be encoded + * @param inputOffset initial offset into inputArray + * @param inputLength number of bytes in inputArray + * @return The number of chars required to encode the number of bytes. + */ + public static int getEncodedLength(byte[] inputArray, int inputOffset, + int inputLength) { + // Use long for intermediaries to protect against overflow + return (int)((8L * inputLength + 14L) / 15L) + 1; + } + + /** + * Returns the number of bytes required to decode the given char sequence. 
+ * + * @param encoded char sequence to be decoded + * @param offset initial offset + * @param length number of characters + * @return The number of bytes required to decode the given char sequence + */ + public static int getDecodedLength(char[] encoded, int offset, int length) { + final int numChars = length - 1; + if (numChars <= 0) { + return 0; + } else { + // Use long for intermediaries to protect against overflow + final long numFullBytesInFinalChar = encoded[offset + length - 1]; + final long numEncodedChars = numChars - 1; + return (int)((numEncodedChars * 15L + 7L) / 8L + numFullBytesInFinalChar); + } + } + + /** + * Encodes the input byte sequence into the output char sequence. Before + * calling this method, ensure that the output array has sufficient + * capacity by calling {@link #getEncodedLength(byte[], int, int)}. + * + * @param inputArray byte sequence to be encoded + * @param inputOffset initial offset into inputArray + * @param inputLength number of bytes in inputArray + * @param outputArray char sequence to store encoded result + * @param outputOffset initial offset into outputArray + * @param outputLength length of output, must be getEncodedLength + */ + public static void encode(byte[] inputArray, int inputOffset, + int inputLength, char[] outputArray, int outputOffset, int outputLength) { + assert (outputLength == getEncodedLength(inputArray, inputOffset, + inputLength)); + if (inputLength > 0) { + int inputByteNum = inputOffset; + int caseNum = 0; + int outputCharNum = outputOffset; + CodingCase codingCase; + for (; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength; ++outputCharNum) { + codingCase = CODING_CASES[caseNum]; + if (2 == codingCase.numBytes) { + outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + + (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF); + } else { // numBytes is 3 + outputArray[outputCharNum] = 
(char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift) + + (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF); + } + inputByteNum += codingCase.advanceBytes; + if (++caseNum == CODING_CASES.length) { + caseNum = 0; + } + } + // Produce final char (if any) and trailing count chars. + codingCase = CODING_CASES[caseNum]; + + if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3 + outputArray[outputCharNum++] = (char) ((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) & (short) 0x7FFF); + // Add trailing char containing the number of full bytes in final char + outputArray[outputCharNum++] = (char) 1; + } else if (inputByteNum < inputLength) { + outputArray[outputCharNum++] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) & (short) 0x7FFF); + // Add trailing char containing the number of full bytes in final char + outputArray[outputCharNum++] = caseNum == 0 ? (char) 1 : (char) 0; + } else { // No left over bits - last char is completely filled. + // Add trailing char containing the number of full bytes in final char + outputArray[outputCharNum++] = (char) 1; + } + } + } + + /** + * Decodes the input char sequence into the output byte sequence. Before + * calling this method, ensure that the output array has sufficient capacity + * by calling {@link #getDecodedLength(char[], int, int)}. 
+ * + * @param inputArray char sequence to be decoded + * @param inputOffset initial offset into inputArray + * @param inputLength number of chars in inputArray + * @param outputArray byte sequence to store encoded result + * @param outputOffset initial offset into outputArray + * @param outputLength length of output, must be + * getDecodedLength(inputArray, inputOffset, inputLength) + */ + public static void decode(char[] inputArray, int inputOffset, + int inputLength, byte[] outputArray, int outputOffset, int outputLength) { + assert (outputLength == getDecodedLength(inputArray, inputOffset, + inputLength)); + final int numInputChars = inputLength - 1; + final int numOutputBytes = outputLength; + + if (numOutputBytes > 0) { + int caseNum = 0; + int outputByteNum = outputOffset; + int inputCharNum = inputOffset; + short inputChar; + CodingCase codingCase; + for (; inputCharNum < numInputChars - 1; ++inputCharNum) { + codingCase = CODING_CASES[caseNum]; + inputChar = (short) inputArray[inputCharNum]; + if (2 == codingCase.numBytes) { + if (0 == caseNum) { + outputArray[outputByteNum] = (byte) (inputChar >>> codingCase.initialShift); + } else { + outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift); + } + outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift); + } else { // numBytes is 3 + outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift); + outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift); + outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift); + } + outputByteNum += codingCase.advanceBytes; + if (++caseNum == CODING_CASES.length) { + caseNum = 0; + } + } + // Handle final char + inputChar = (short) inputArray[inputCharNum]; + codingCase = CODING_CASES[caseNum]; + if (0 == caseNum) { + outputArray[outputByteNum] = 0; + } + outputArray[outputByteNum] += (byte) 
(inputChar >>> codingCase.initialShift); + final int bytesLeft = numOutputBytes - outputByteNum; + if (bytesLeft > 1) { + if (2 == codingCase.numBytes) { + outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) >>> codingCase.finalShift); + } else { // numBytes is 3 + outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift); + if (bytesLeft > 2) { + outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift); + } + } + } + } + } + + static class CodingCase { + int numBytes, initialShift, middleShift, finalShift, advanceBytes = 2; + short middleMask, finalMask; + + CodingCase(int initialShift, int middleShift, int finalShift) { + this.numBytes = 3; + this.initialShift = initialShift; + this.middleShift = middleShift; + this.finalShift = finalShift; + this.finalMask = (short)((short)0xFF >>> finalShift); + this.middleMask = (short)((short)0xFF << middleShift); + } + + CodingCase(int initialShift, int finalShift) { + this.numBytes = 2; + this.initialShift = initialShift; + this.finalShift = finalShift; + this.finalMask = (short)((short)0xFF >>> finalShift); + if (finalShift != 0) { + advanceBytes = 1; + } + } + } +} diff --git a/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java b/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java index a4330efd0d2..7ca0a089bbf 100644 --- a/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java +++ b/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java @@ -19,6 +19,7 @@ package org.elasticsearch.indices.analysis; +import com.ibm.icu.text.Collator; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.Transliterator; import org.apache.lucene.analysis.TokenStream; @@ -29,6 +30,7 @@ import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; import org.elasticsearch.common.component.AbstractComponent; import 
org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.analysis.ICUCollationKeyFilter; import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory; import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory; import org.elasticsearch.index.analysis.TokenFilterFactory; @@ -81,6 +83,18 @@ public class IcuIndicesAnalysis extends AbstractComponent { } })); + indicesAnalysisService.tokenFilterFactories().put("icu_collation", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override + public String name() { + return "icu_collation"; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ICUCollationKeyFilter(tokenStream, Collator.getInstance()); + } + })); + indicesAnalysisService.tokenFilterFactories().put("icu_transform", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { @Override public String name() { diff --git a/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java b/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java index d8a13b0febe..95874c98b07 100644 --- a/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java @@ -52,8 +52,10 @@ public class ICUIntegrationTests extends ElasticsearchIntegrationTest { Settings settings = ImmutableSettings.builder() .put(super.indexSettings()) .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard") - .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "my_folding") - .put("index.analysis.filter.my_folding.type", "icu_folding") + .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_collator") + .put("index.analysis.filter.my_collator.type", "icu_collation") + .put("index.analysis.filter.my_collator.language", "en") + .put("index.analysis.filter.my_collator.strength", "primary") .build(); 
return settings; diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java new file mode 100644 index 00000000000..2ac85a4a5d2 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java @@ -0,0 +1,255 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.util.ULocale; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.test.ElasticsearchTestCase; +import org.junit.Test; + +import java.io.IOException; +import java.io.StringReader; + +import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService; +import static org.hamcrest.Matchers.equalTo; + +// Tests borrowed from Solr's Icu collation key filter factory test. +public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { + + /* + * Turkish has some funny casing. + * This test shows how you can solve this kind of thing easily with collation. + * Instead of using LowerCaseFilter, use a turkish collator with primary strength. + * Then things will sort and match correctly. + */ + @Test + public void testBasicUsage() throws Exception { + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "tr") + .put("index.analysis.filter.myCollator.strength", "primary") + .build(); + AnalysisService analysisService = createAnalysisService(settings); + + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + assertCollatesToSame(filterFactory, "I WİLL USE TURKİSH CASING", "ı will use turkish casıng"); + } + + /* + * Test usage of the decomposition option for unicode normalization. 
+ */ + @Test + public void testNormalization() throws IOException { + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "tr") + .put("index.analysis.filter.myCollator.strength", "primary") + .put("index.analysis.filter.myCollator.decomposition", "canonical") + .build(); + AnalysisService analysisService = createAnalysisService(settings); + + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + assertCollatesToSame(filterFactory, "I W\u0049\u0307LL USE TURKİSH CASING", "ı will use turkish casıng"); + } + + /* + * Test secondary strength, for english case is not significant. + */ + @Test + public void testSecondaryStrength() throws IOException { + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "en") + .put("index.analysis.filter.myCollator.strength", "secondary") + .put("index.analysis.filter.myCollator.decomposition", "no") + .build(); + AnalysisService analysisService = createAnalysisService(settings); + + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + assertCollatesToSame(filterFactory, "TESTING", "testing"); + } + + /* + * Setting alternate=shifted to shift whitespace, punctuation and symbols + * to quaternary level + */ + @Test + public void testIgnorePunctuation() throws IOException { + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "en") + .put("index.analysis.filter.myCollator.strength", "primary") + .put("index.analysis.filter.myCollator.alternate", "shifted") + .build(); + AnalysisService analysisService = createAnalysisService(settings); + + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + 
assertCollatesToSame(filterFactory, "foo-bar", "foo bar"); + } + + /* + * Setting alternate=shifted and variableTop to shift whitespace, but not + * punctuation or symbols, to quaternary level + */ + @Test + public void testIgnoreWhitespace() throws IOException { + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "en") + .put("index.analysis.filter.myCollator.strength", "primary") + .put("index.analysis.filter.myCollator.alternate", "shifted") + .put("index.analysis.filter.myCollator.variableTop", " ") + .build(); + AnalysisService analysisService = createAnalysisService(settings); + + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + assertCollatesToSame(filterFactory, "foo bar", "foobar"); + // now assert that punctuation still matters: foo-bar < foo bar + assertCollation(filterFactory, "foo-bar", "foo bar", -1); + } + + /* + * Setting numeric to encode digits with numeric value, so that + * foobar-9 sorts before foobar-10 + */ + @Test + public void testNumerics() throws IOException { + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "en") + .put("index.analysis.filter.myCollator.numeric", "true") + .build(); + AnalysisService analysisService = createAnalysisService(settings); + + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + assertCollation(filterFactory, "foobar-9", "foobar-10", -1); + } + + /* + * Setting caseLevel=true to create an additional case level between + * secondary and tertiary + */ + @Test + public void testIgnoreAccentsButNotCase() throws IOException { + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "en") + 
.put("index.analysis.filter.myCollator.strength", "primary") + .put("index.analysis.filter.myCollator.caseLevel", "true") + .build(); + AnalysisService analysisService = createAnalysisService(settings); + + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + assertCollatesToSame(filterFactory, "résumé", "resume"); + assertCollatesToSame(filterFactory, "Résumé", "Resume"); + // now assert that case still matters: resume < Resume + assertCollation(filterFactory, "resume", "Resume", -1); + } + + /* + * Setting caseFirst=upper to cause uppercase strings to sort + * before lowercase ones. + */ + @Test + public void testUpperCaseFirst() throws IOException { + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.language", "en") + .put("index.analysis.filter.myCollator.strength", "tertiary") + .put("index.analysis.filter.myCollator.caseFirst", "upper") + .build(); + AnalysisService analysisService = createAnalysisService(settings); + + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + assertCollation(filterFactory, "Resume", "resume", -1); + } + + /* + * For german, you might want oe to sort and match with o umlaut. + * This is not the default, but you can make a customized ruleset to do this. + * + * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior. 
+ * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383 + */ + @Test + public void testCustomRules() throws Exception { + RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE")); + String DIN5007_2_tailorings = + "& ae , a\u0308 & AE , A\u0308"+ + "& oe , o\u0308 & OE , O\u0308"+ + "& ue , u\u0308 & UE , u\u0308"; + + RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings); + String tailoredRules = tailoredCollator.getRules(); + + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.myCollator.type", "icu_collation") + .put("index.analysis.filter.myCollator.rules", tailoredRules) + .put("index.analysis.filter.myCollator.strength", "primary") + .build(); + AnalysisService analysisService = createAnalysisService(settings); + + TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); + assertCollatesToSame(filterFactory, "Töne", "Toene"); + } + + private void assertCollatesToSame(TokenFilterFactory factory, String string1, String string2) throws IOException { + assertCollation(factory, string1, string2, 0); + } + + private void assertCollation(TokenFilterFactory factory, String string1, String string2, int comparison) throws IOException { + Tokenizer tokenizer = new KeywordTokenizer(); + tokenizer.setReader(new StringReader(string1)); + TokenStream stream1 = factory.create(tokenizer); + + tokenizer = new KeywordTokenizer(); + tokenizer.setReader(new StringReader(string2)); + TokenStream stream2 = factory.create(tokenizer); + + assertCollation(stream1, stream2, comparison); + } + + private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException { + CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class); + CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class); + + stream1.reset(); + stream2.reset(); + + assertThat(stream1.incrementToken(), 
equalTo(true)); + assertThat(stream2.incrementToken(), equalTo(true)); + assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison))); + assertThat(stream1.incrementToken(), equalTo(false)); + assertThat(stream2.incrementToken(), equalTo(false)); + + stream1.end(); + stream2.end(); + + stream1.close(); + stream2.close(); + } +} diff --git a/src/test/java/org/elasticsearch/index/analysis/TestIndexableBinaryStringTools.java b/src/test/java/org/elasticsearch/index/analysis/TestIndexableBinaryStringTools.java new file mode 100644 index 00000000000..5710a90df0e --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/TestIndexableBinaryStringTools.java @@ -0,0 +1,251 @@ +package org.elasticsearch.index.analysis; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.TimeUnits; +import org.elasticsearch.test.ElasticsearchThreadFilter; +import org.elasticsearch.test.junit.listeners.ReproduceInfoPrinter; +import org.junit.BeforeClass; + +import com.carrotsearch.randomizedtesting.annotations.Listeners; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope; +import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope; + +import java.util.Locale; + +/** + * @deprecated Remove when IndexableBinaryStringTools is removed. + */ +@Deprecated +@Listeners({ + ReproduceInfoPrinter.class +}) +@ThreadLeakFilters(defaultFilters = true, filters = {ElasticsearchThreadFilter.class}) +@ThreadLeakScope(Scope.NONE) +@TimeoutSuite(millis = TimeUnits.HOUR) +@LuceneTestCase.SuppressSysoutChecks(bugUrl = "we log a lot on purpose") +public class TestIndexableBinaryStringTools extends LuceneTestCase { + private static int NUM_RANDOM_TESTS; + private static int MAX_RANDOM_BINARY_LENGTH; + + @BeforeClass + public static void beforeClass() throws Exception { + NUM_RANDOM_TESTS = atLeast(200); + MAX_RANDOM_BINARY_LENGTH = atLeast(300); + } + + public void testSingleBinaryRoundTrip() { + byte[] binary = new byte[] { (byte) 0x23, (byte) 0x98, (byte) 0x13, + (byte) 0xE4, (byte) 0x76, (byte) 0x41, (byte) 0xB2, (byte) 0xC9, + (byte) 0x7F, (byte) 0x0A, (byte) 0xA6, (byte) 0xD8 }; + + int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0, + binary.length); + char encoded[] = new char[encodedLen]; + IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0, + encoded.length); + + int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0, + encoded.length); + byte decoded[] = 
new byte[decodedLen]; + IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0, + decoded.length); + + assertEquals("Round trip decode/decode returned different results:" + + System.getProperty("line.separator") + "original: " + + binaryDump(binary, binary.length) + + System.getProperty("line.separator") + " encoded: " + + charArrayDump(encoded, encoded.length) + + System.getProperty("line.separator") + " decoded: " + + binaryDump(decoded, decoded.length), + binaryDump(binary, binary.length), binaryDump(decoded, decoded.length)); + } + + public void testEncodedSortability() { + byte[] originalArray1 = new byte[MAX_RANDOM_BINARY_LENGTH]; + char[] originalString1 = new char[MAX_RANDOM_BINARY_LENGTH]; + char[] encoded1 = new char[MAX_RANDOM_BINARY_LENGTH * 10]; + byte[] original2 = new byte[MAX_RANDOM_BINARY_LENGTH]; + char[] originalString2 = new char[MAX_RANDOM_BINARY_LENGTH]; + char[] encoded2 = new char[MAX_RANDOM_BINARY_LENGTH * 10]; + + for (int testNum = 0; testNum < NUM_RANDOM_TESTS; ++testNum) { + int numBytes1 = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1 + + for (int byteNum = 0; byteNum < numBytes1; ++byteNum) { + int randomInt = random().nextInt(0x100); + originalArray1[byteNum] = (byte) randomInt; + originalString1[byteNum] = (char) randomInt; + } + + int numBytes2 = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1 + + for (int byteNum = 0; byteNum < numBytes2; ++byteNum) { + int randomInt = random().nextInt(0x100); + original2[byteNum] = (byte) randomInt; + originalString2[byteNum] = (char) randomInt; + } + int originalComparison = new String(originalString1, 0, numBytes1) + .compareTo(new String(originalString2, 0, numBytes2)); + originalComparison = originalComparison < 0 ? -1 + : originalComparison > 0 ? 
1 : 0; + + int encodedLen1 = IndexableBinaryStringTools.getEncodedLength( + originalArray1, 0, numBytes1); + if (encodedLen1 > encoded1.length) + encoded1 = new char[ArrayUtil.oversize(encodedLen1, RamUsageEstimator.NUM_BYTES_CHAR)]; + IndexableBinaryStringTools.encode(originalArray1, 0, numBytes1, encoded1, + 0, encodedLen1); + + int encodedLen2 = IndexableBinaryStringTools.getEncodedLength(original2, + 0, numBytes2); + if (encodedLen2 > encoded2.length) + encoded2 = new char[ArrayUtil.oversize(encodedLen2, RamUsageEstimator.NUM_BYTES_CHAR)]; + IndexableBinaryStringTools.encode(original2, 0, numBytes2, encoded2, 0, + encodedLen2); + + int encodedComparison = new String(encoded1, 0, encodedLen1) + .compareTo(new String(encoded2, 0, encodedLen2)); + encodedComparison = encodedComparison < 0 ? -1 + : encodedComparison > 0 ? 1 : 0; + + assertEquals("Test #" + (testNum + 1) + + ": Original bytes and encoded chars compare differently:" + + System.getProperty("line.separator") + " binary 1: " + + binaryDump(originalArray1, numBytes1) + + System.getProperty("line.separator") + " binary 2: " + + binaryDump(original2, numBytes2) + + System.getProperty("line.separator") + "encoded 1: " + + charArrayDump(encoded1, encodedLen1) + + System.getProperty("line.separator") + "encoded 2: " + + charArrayDump(encoded2, encodedLen2) + + System.getProperty("line.separator"), originalComparison, + encodedComparison); + } + } + + public void testEmptyInput() { + byte[] binary = new byte[0]; + + int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0, + binary.length); + char[] encoded = new char[encodedLen]; + IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0, + encoded.length); + + int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0, + encoded.length); + byte[] decoded = new byte[decodedLen]; + IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0, + decoded.length); + + assertEquals("decoded empty input was not 
empty", decoded.length, 0); + } + + public void testAllNullInput() { + byte[] binary = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0, + binary.length); + char encoded[] = new char[encodedLen]; + IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0, + encoded.length); + + int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0, + encoded.length); + byte[] decoded = new byte[decodedLen]; + IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0, + decoded.length); + + assertEquals("Round trip decode/decode returned different results:" + + System.getProperty("line.separator") + " original: " + + binaryDump(binary, binary.length) + + System.getProperty("line.separator") + "decodedBuf: " + + binaryDump(decoded, decoded.length), + binaryDump(binary, binary.length), binaryDump(decoded, decoded.length)); + } + + public void testRandomBinaryRoundTrip() { + byte[] binary = new byte[MAX_RANDOM_BINARY_LENGTH]; + char[] encoded = new char[MAX_RANDOM_BINARY_LENGTH * 10]; + byte[] decoded = new byte[MAX_RANDOM_BINARY_LENGTH]; + for (int testNum = 0; testNum < NUM_RANDOM_TESTS; ++testNum) { + int numBytes = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1 + + for (int byteNum = 0; byteNum < numBytes; ++byteNum) { + binary[byteNum] = (byte) random().nextInt(0x100); + } + + int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0, + numBytes); + if (encoded.length < encodedLen) + encoded = new char[ArrayUtil.oversize(encodedLen, RamUsageEstimator.NUM_BYTES_CHAR)]; + IndexableBinaryStringTools.encode(binary, 0, numBytes, encoded, 0, + encodedLen); + + int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0, + encodedLen); + IndexableBinaryStringTools.decode(encoded, 0, encodedLen, decoded, 0, + decodedLen); + + assertEquals("Test #" + (testNum + 1) + + ": Round trip decode/decode returned different results:" + + 
System.getProperty("line.separator") + " original: " + + binaryDump(binary, numBytes) + System.getProperty("line.separator") + + "encodedBuf: " + charArrayDump(encoded, encodedLen) + + System.getProperty("line.separator") + "decodedBuf: " + + binaryDump(decoded, decodedLen), binaryDump(binary, numBytes), + binaryDump(decoded, decodedLen)); + } + } + + public String binaryDump(byte[] binary, int numBytes) { + StringBuilder buf = new StringBuilder(); + for (int byteNum = 0 ; byteNum < numBytes ; ++byteNum) { + String hex = Integer.toHexString(binary[byteNum] & 0xFF); + if (hex.length() == 1) { + buf.append('0'); + } + buf.append(hex.toUpperCase(Locale.ROOT)); + if (byteNum < numBytes - 1) { + buf.append(' '); + } + } + return buf.toString(); + } + + public String charArrayDump(char[] charArray, int numBytes) { + StringBuilder buf = new StringBuilder(); + for (int charNum = 0 ; charNum < numBytes ; ++charNum) { + String hex = Integer.toHexString(charArray[charNum]); + for (int digit = 0 ; digit < 4 - hex.length() ; ++digit) { + buf.append('0'); + } + buf.append(hex.toUpperCase(Locale.ROOT)); + if (charNum < numBytes - 1) { + buf.append(' '); + } + } + return buf.toString(); + } +} From f50c01db1ab47b59b1f6b72286f5188594db14e2 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 6 Nov 2014 03:15:42 -0500 Subject: [PATCH 099/131] actually run all the tests --- pom.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/pom.xml b/pom.xml index 938ddef943e..6b44bb7e431 100644 --- a/pom.xml +++ b/pom.xml @@ -161,6 +161,7 @@ + **/Test*.class **/*Tests.class **/*Test.class From 121513dd59a09916876e53057a90787352f02e70 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Mon, 10 Nov 2014 16:45:17 -0500 Subject: [PATCH 100/131] Upgrade to Lucene 5.0.0-snapshot-1637347 --- pom.xml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index 6b44bb7e431..c801d1c86e9 100644 --- a/pom.xml +++ b/pom.xml @@ -34,7 +34,7 @@ 2.0.0-SNAPSHOT 
5.0.0 - 5.0.0-snapshot-1636426 + 5.0.0-snapshot-1637347 1 true onerror @@ -44,12 +44,12 @@ - sonatype - http://oss.sonatype.org/content/repositories/releases/ + Lucene snapshots + https://download.elasticsearch.org/lucenesnapshots/1637347/ - Lucene snapshots - https://download.elasticsearch.org/lucenesnapshots/maven/ + sonatype + http://oss.sonatype.org/content/repositories/releases/ From 0f49ccdcdee7f61db66743ed9d3374e0a1be068b Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Mon, 24 Nov 2014 05:51:19 -0500 Subject: [PATCH 101/131] Upgrade to Lucene 5.0.0-snapshot-1641343 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index c801d1c86e9..6d43bcfcfab 100644 --- a/pom.xml +++ b/pom.xml @@ -34,7 +34,7 @@ 2.0.0-SNAPSHOT 5.0.0 - 5.0.0-snapshot-1637347 + 5.0.0-snapshot-1641343 1 true onerror @@ -45,7 +45,7 @@ Lucene snapshots - https://download.elasticsearch.org/lucenesnapshots/1637347/ + https://download.elasticsearch.org/lucenesnapshots/1641343/ sonatype From cda7537967a62df02621a6cd43bc1ffff3d7ef63 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 2 Dec 2014 18:15:00 +0100 Subject: [PATCH 102/131] Upgrade to Lucene 5.0.0-snapshot-1642891 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 6d43bcfcfab..984ec3a51f0 100644 --- a/pom.xml +++ b/pom.xml @@ -34,7 +34,7 @@ 2.0.0-SNAPSHOT 5.0.0 - 5.0.0-snapshot-1641343 + 5.0.0-snapshot-1642891 1 true onerror @@ -45,7 +45,7 @@ Lucene snapshots - https://download.elasticsearch.org/lucenesnapshots/1641343/ + https://download.elasticsearch.org/lucenesnapshots/1642891/ sonatype From e50f8349e0eca83a31351e2771ab0365688052a2 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 3 Dec 2014 21:13:51 +0100 Subject: [PATCH 103/131] Replace resolveConfigAndLoadToString This method was removed by 
https://github.com/elasticsearch/elasticsearch/commit/3dfff8404362287e16466b37998ea0965eb1eacd#diff-f06b176696959d1967c63d5b74fd58ac in elasticsearch master branch --- .../index/analysis/IcuCollationTokenFilterFactory.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java index 489deafe313..81c7d187062 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java @@ -24,8 +24,10 @@ import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.util.ULocale; import org.apache.lucene.analysis.TokenStream; import org.elasticsearch.ElasticsearchIllegalArgumentException; +import org.elasticsearch.common.base.Charsets; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.io.Streams; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.env.FailedToResolveConfigException; @@ -33,6 +35,9 @@ import org.elasticsearch.index.Index; import org.elasticsearch.index.settings.IndexSettings; import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Paths; /** * An ICU based collation token filter. 
There are two ways to configure collation: @@ -58,11 +63,13 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { if (rules != null) { FailedToResolveConfigException failureToResolve = null; try { - rules = environment.resolveConfigAndLoadToString(rules); + rules = Streams.copyToString(Files.newBufferedReader(Paths.get(environment.resolveConfig(rules).toURI()), Charsets.UTF_8)); } catch (FailedToResolveConfigException e) { failureToResolve = e; } catch (IOException e) { throw new ElasticsearchIllegalArgumentException("Failed to load collation rules", e); + } catch (URISyntaxException e) { + throw new ElasticsearchIllegalArgumentException("Failed to load collation rules", e); } try { collator = new RuleBasedCollator(rules); From 8204be34e3e9a69ec530f0e1f7b6374067a3c053 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 12 Dec 2014 16:23:28 +0100 Subject: [PATCH 104/131] Depend on elasticsearch-parent To simplify plugins maintenance and provide more value in the future, we are starting to build an `elasticsearch-parent` project. This PR is the first step for this plugin to depend on this new `pom` maven project. --- dev-tools/tests.policy | 54 ++++++++++++++++ pom.xml | 139 ++--------------------------------------- 2 files changed, 58 insertions(+), 135 deletions(-) create mode 100644 dev-tools/tests.policy diff --git a/dev-tools/tests.policy b/dev-tools/tests.policy new file mode 100644 index 00000000000..6afb5025840 --- /dev/null +++ b/dev-tools/tests.policy @@ -0,0 +1,54 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Policy file to prevent tests from writing outside the test sandbox directory +// PLEASE NOTE: You may need to enable other permissions when new tests are added, +// everything not allowed here is forbidden! + +grant { + // permissions for file access, write access only to sandbox: + permission java.io.FilePermission "<>", "read,execute"; + permission java.io.FilePermission "${junit4.childvm.cwd}", "read,execute,write"; + permission java.io.FilePermission "${junit4.childvm.cwd}${/}-", "read,execute,write,delete"; + permission java.io.FilePermission "${junit4.tempDir}${/}*", "read,execute,write,delete"; + permission groovy.security.GroovyCodeSourcePermission "/groovy/script"; + + // Allow connecting to the internet anywhere + permission java.net.SocketPermission "*", "accept,listen,connect,resolve"; + + // Basic permissions needed for Lucene / Elasticsearch to work: + permission java.util.PropertyPermission "*", "read,write"; + permission java.lang.reflect.ReflectPermission "*"; + permission java.lang.RuntimePermission "*"; + + // These two *have* to be spelled out a separate + permission java.lang.management.ManagementPermission "control"; + permission java.lang.management.ManagementPermission "monitor"; + + permission java.net.NetPermission "*"; + permission java.util.logging.LoggingPermission "control"; + permission javax.management.MBeanPermission "*", "*"; + permission javax.management.MBeanServerPermission "*"; + permission javax.management.MBeanTrustPermission "*"; + + // Needed for some things in DNS caching in the JVM + 
permission java.security.SecurityPermission "getProperty.networkaddress.cache.ttl"; + permission java.security.SecurityPermission "getProperty.networkaddress.cache.negative.ttl"; + +}; diff --git a/pom.xml b/pom.xml index 984ec3a51f0..67e6ffa4f29 100644 --- a/pom.xml +++ b/pom.xml @@ -26,80 +26,49 @@ - org.sonatype.oss - oss-parent - 7 + org.elasticsearch + elasticsearch-parent + 2.0.0-SNAPSHOT - 2.0.0-SNAPSHOT - 5.0.0 - 5.0.0-snapshot-1642891 1 - true - onerror - INFO - - - Lucene snapshots - https://download.elasticsearch.org/lucenesnapshots/1642891/ - - - sonatype - http://oss.sonatype.org/content/repositories/releases/ - - - org.hamcrest hamcrest-all - 1.3 - test com.carrotsearch.randomizedtesting randomizedtesting-runner - 2.1.10 - test org.apache.lucene lucene-test-framework - ${lucene.maven.version} - test org.elasticsearch elasticsearch - ${elasticsearch.version} - compile org.apache.lucene lucene-analyzers-icu - ${lucene.maven.version} - compile log4j log4j - 1.2.17 - runtime org.elasticsearch elasticsearch - ${elasticsearch.version} test-jar - test @@ -115,122 +84,23 @@ org.apache.maven.plugins maven-compiler-plugin - 2.3.2 - - 1.7 - 1.7 - com.carrotsearch.randomizedtesting junit4-maven-plugin - 2.0.12 - - - tests - test - - junit4 - - - 20 - pipe,warn - true - - - - - - - - - ${tests.jvms} - - - - - - - **/Test*.class - **/*Tests.class - **/*Test.class - - - **/Abstract*.class - **/*StressTest.class - - - -Xmx512m - -XX:MaxDirectMemorySize=512m - -Des.logger.prefix= - - ${tests.shuffle} - ${tests.verbose} - ${tests.seed} - ${tests.failfast} - - - ${tests.iters} - ${tests.maxfailures} - ${tests.failfast} - ${tests.class} - ${tests.method} - ${tests.nightly} - ${tests.badapples} - ${tests.weekly} - ${tests.slow} - ${tests.awaitsfix} - ${tests.slow} - ${tests.timeoutSuite} - ${tests.showSuccess} - ${tests.integration} - ${tests.cluster_seed} - ${tests.client.ratio} - ${env.ES_TEST_LOCAL} - ${es.node.mode} - ${es.logger.level} - true - - - - - 
org.apache.maven.plugins maven-surefire-plugin - 2.15 - - true - org.apache.maven.plugins maven-source-plugin - 2.1.2 - - - attach-sources - - jar - - - + org.apache.maven.plugins maven-assembly-plugin - 2.3 false ${project.build.directory}/releases/ @@ -250,4 +120,3 @@ - From aa162da27c631a692c3ebf3463817ded30bb43ac Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 12 Dec 2014 18:11:43 +0100 Subject: [PATCH 105/131] add /.local-*-execution-hints.log --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index a99aad1be2e..e432ab87145 100644 --- a/.gitignore +++ b/.gitignore @@ -8,5 +8,6 @@ /.settings /.classpath /.project -/.local-execution-hints.log /plugin_tools +/.local-execution-hints.log +/.local-*-execution-hints.log From 8fecb4d508fec2ca071b2448c1e8af787a36c929 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 2 Jan 2015 20:35:26 +0100 Subject: [PATCH 106/131] Add sonatype snapshot repository --- pom.xml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pom.xml b/pom.xml index 67e6ffa4f29..416a8f2d9ec 100644 --- a/pom.xml +++ b/pom.xml @@ -119,4 +119,12 @@ + + + + oss-snapshots + Sonatype OSS Snapshots + https://oss.sonatype.org/content/repositories/snapshots/ + + From a0b9cb2373f05e771cd434d1e20715664a7147d5 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 11 Feb 2015 21:40:42 +0100 Subject: [PATCH 107/131] update documentation with release 2.4.2 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index afa9150215e..20973346da5 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, simply run: ```sh -bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.4.1 +bin/plugin install elasticsearch/elasticsearch-analysis-icu/2.4.2 ``` You need to install a version matching your Elasticsearch version: @@ -15,7 +15,7 @@ 
You need to install a version matching your Elasticsearch version: |---------------|-----------------------|------------| | master | Build from source | See below | | es-1.x | Build from source | [2.5.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.x/#version-250-snapshot-for-elasticsearch-1x) | -| es-1.4 | 2.4.1 | [2.4.1](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.1/#version-241-for-elasticsearch-14) | +| es-1.4 | 2.4.2 | [2.4.2](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.2/#version-242-for-elasticsearch-14) | | es-1.3 | 2.3.0 | [2.3.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.3.0/#icu-analysis-for-elasticsearch) | | es-1.2 | 2.2.0 | [2.2.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.2.0/#icu-analysis-for-elasticsearch) | | es-1.1 | 2.1.0 | [2.1.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.1.0/#icu-analysis-for-elasticsearch) | From aff0c5fc9f50e10cb9703de40017a6fd20eaf756 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 13 Feb 2015 16:41:45 +0100 Subject: [PATCH 108/131] Fix doc for es version < 1.4.3 --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 20973346da5..2b2129948a9 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ You need to install a version matching your Elasticsearch version: | master | Build from source | See below | | es-1.x | Build from source | [2.5.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.x/#version-250-snapshot-for-elasticsearch-1x) | | es-1.4 | 2.4.2 | [2.4.2](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.2/#version-242-for-elasticsearch-14) | +| < 1.4.3 | 2.4.1 | [2.4.1](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.1/#version-241-for-elasticsearch-14) | | es-1.3 | 2.3.0 | 
[2.3.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.3.0/#icu-analysis-for-elasticsearch) | | es-1.2 | 2.2.0 | [2.2.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.2.0/#icu-analysis-for-elasticsearch) | | es-1.1 | 2.1.0 | [2.1.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.1.0/#icu-analysis-for-elasticsearch) | From eac332ae07d1fc1c4c4bcb24b497164d214b6c52 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 27 Feb 2015 09:19:25 +0100 Subject: [PATCH 109/131] Remove ThreadLeak filter Due to elasticsearch/elasticsearch#9843 --- .../analysis/TestIndexableBinaryStringTools.java | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/test/java/org/elasticsearch/index/analysis/TestIndexableBinaryStringTools.java b/src/test/java/org/elasticsearch/index/analysis/TestIndexableBinaryStringTools.java index 5710a90df0e..ff6121f86e4 100644 --- a/src/test/java/org/elasticsearch/index/analysis/TestIndexableBinaryStringTools.java +++ b/src/test/java/org/elasticsearch/index/analysis/TestIndexableBinaryStringTools.java @@ -17,20 +17,17 @@ package org.elasticsearch.index.analysis; * limitations under the License. 
*/ +import com.carrotsearch.randomizedtesting.annotations.Listeners; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope; +import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.TimeUnits; -import org.elasticsearch.test.ElasticsearchThreadFilter; import org.elasticsearch.test.junit.listeners.ReproduceInfoPrinter; import org.junit.BeforeClass; -import com.carrotsearch.randomizedtesting.annotations.Listeners; -import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; -import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope; -import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; -import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope; - import java.util.Locale; /** @@ -40,7 +37,6 @@ import java.util.Locale; @Listeners({ ReproduceInfoPrinter.class }) -@ThreadLeakFilters(defaultFilters = true, filters = {ElasticsearchThreadFilter.class}) @ThreadLeakScope(Scope.NONE) @TimeoutSuite(millis = TimeUnits.HOUR) @LuceneTestCase.SuppressSysoutChecks(bugUrl = "we log a lot on purpose") From bd6976dfa81f1bd918b93625e3dd493ea4921084 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Mon, 16 Mar 2015 16:21:53 -0700 Subject: [PATCH 110/131] create branch `es-1.5` --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2b2129948a9..11891e6368e 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ You need to install a version matching your Elasticsearch version: | elasticsearch | ICU Analysis Plugin | Docs | |---------------|-----------------------|------------| | master | Build from source | See below | -| es-1.x | Build from source | 
[2.5.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.x/#version-250-snapshot-for-elasticsearch-1x) | +| es-1.x | Build from source | [2.6.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.x/#version-260-snapshot-for-elasticsearch-1x) | +| es-1.5 | Build from source | [2.5.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.5/#version-250-snapshot-for-elasticsearch-15) | | es-1.4 | 2.4.2 | [2.4.2](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.2/#version-242-for-elasticsearch-14) | | < 1.4.3 | 2.4.1 | [2.4.1](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.1/#version-241-for-elasticsearch-14) | | es-1.3 | 2.3.0 | [2.3.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.3.0/#icu-analysis-for-elasticsearch) | From 6e25fc2cb25f943de1d57c4d8d54d7da7e0b52e2 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Tue, 24 Mar 2015 17:40:07 +0100 Subject: [PATCH 111/131] update documentation with release 2.5.0 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 11891e6368e..d8931bf10d6 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, simply run: ```sh -bin/plugin install elasticsearch/elasticsearch-analysis-icu/2.4.2 +bin/plugin install elasticsearch/elasticsearch-analysis-icu/2.5.0 ``` You need to install a version matching your Elasticsearch version: @@ -15,7 +15,7 @@ You need to install a version matching your Elasticsearch version: |---------------|-----------------------|------------| | master | Build from source | See below | | es-1.x | Build from source | [2.6.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.x/#version-260-snapshot-for-elasticsearch-1x) | -| es-1.5 | Build from source | 
[2.5.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.5/#version-250-snapshot-for-elasticsearch-15) | +| es-1.5 | 2.5.0 | [2.5.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.5.0/#version-250-for-elasticsearch-15) | | es-1.4 | 2.4.2 | [2.4.2](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.2/#version-242-for-elasticsearch-14) | | < 1.4.3 | 2.4.1 | [2.4.1](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.1/#version-241-for-elasticsearch-14) | | es-1.3 | 2.3.0 | [2.3.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.3.0/#icu-analysis-for-elasticsearch) | From 94b770b637876fa45d3dcc3e26f9f9eb390cf7a0 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Tue, 24 Mar 2015 17:49:18 +0100 Subject: [PATCH 112/131] Update to elastic owner We moved elasticsearch to elastic (cherry picked from commit 500f265) (cherry picked from commit 583c5b2) --- pom.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 416a8f2d9ec..be4d6ee02b8 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ jar Elasticsearch ICU Analysis plugin The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. 
- https://github.com/elasticsearch/elasticsearch-analysis-icu/ + https://github.com/elastic/elasticsearch-analysis-icu/ 2009 @@ -19,10 +19,10 @@ - scm:git:git@github.com:elasticsearch/elasticsearch-analysis-icu.git - scm:git:git@github.com:elasticsearch/elasticsearch-analysis-icu.git + scm:git:git@github.com:elastic/elasticsearch-analysis-icu.git + scm:git:git@github.com:elastic/elasticsearch-analysis-icu.git - http://github.com/elasticsearch/elasticsearch-analysis-icu + http://github.com/elastic/elasticsearch-analysis-icu From 6b7b7e374ae325ea1bac73114669709f7bdde4db Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 29 Apr 2015 18:55:48 +0200 Subject: [PATCH 113/131] Remove `ElasticsearchIllegalArgumentException` and `ElasticsearchIllegalStateException` in favor of the JDK one Related to https://github.com/elastic/elasticsearch/issues/10794 Closes #50. --- .../IcuCollationTokenFilterFactory.java | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java index 81c7d187062..b250dc3c56e 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java @@ -23,7 +23,6 @@ import com.ibm.icu.text.Collator; import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.util.ULocale; import org.apache.lucene.analysis.TokenStream; -import org.elasticsearch.ElasticsearchIllegalArgumentException; import org.elasticsearch.common.base.Charsets; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; @@ -67,17 +66,17 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { } catch (FailedToResolveConfigException e) { failureToResolve = e; } catch (IOException e) { - throw new 
ElasticsearchIllegalArgumentException("Failed to load collation rules", e); + throw new IllegalArgumentException("Failed to load collation rules", e); } catch (URISyntaxException e) { - throw new ElasticsearchIllegalArgumentException("Failed to load collation rules", e); + throw new IllegalArgumentException("Failed to load collation rules", e); } try { collator = new RuleBasedCollator(rules); } catch (Exception e) { if (failureToResolve != null) { - throw new ElasticsearchIllegalArgumentException("Failed to resolve collation rules location", failureToResolve); + throw new IllegalArgumentException("Failed to resolve collation rules location", failureToResolve); } else { - throw new ElasticsearchIllegalArgumentException("Failed to parse collation rules", e); + throw new IllegalArgumentException("Failed to parse collation rules", e); } } } else { @@ -115,7 +114,7 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { } else if (strength.equalsIgnoreCase("identical")) { collator.setStrength(Collator.IDENTICAL); } else { - throw new ElasticsearchIllegalArgumentException("Invalid strength: " + strength); + throw new IllegalArgumentException("Invalid strength: " + strength); } } @@ -127,7 +126,7 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { } else if (decomposition.equalsIgnoreCase("canonical")) { collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION); } else { - throw new ElasticsearchIllegalArgumentException("Invalid decomposition: " + decomposition); + throw new IllegalArgumentException("Invalid decomposition: " + decomposition); } } @@ -140,7 +139,7 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { } else if (alternate.equalsIgnoreCase("non-ignorable")) { rbc.setAlternateHandlingShifted(false); } else { - throw new ElasticsearchIllegalArgumentException("Invalid alternate: " + alternate); + throw new IllegalArgumentException("Invalid alternate: " + alternate); } } 
@@ -156,7 +155,7 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { } else if (caseFirst.equalsIgnoreCase("upper")) { rbc.setUpperCaseFirst(true); } else { - throw new ElasticsearchIllegalArgumentException("Invalid caseFirst: " + caseFirst); + throw new IllegalArgumentException("Invalid caseFirst: " + caseFirst); } } From 3d8d8ff6eeef4645af789c0fc64f2854d55a0015 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 30 Apr 2015 15:56:07 +0200 Subject: [PATCH 114/131] Update compatibility matrix for elasticsearch 1.4.5 Closes #51. --- README.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index d8931bf10d6..50735972b8e 100644 --- a/README.md +++ b/README.md @@ -14,15 +14,16 @@ You need to install a version matching your Elasticsearch version: | elasticsearch | ICU Analysis Plugin | Docs | |---------------|-----------------------|------------| | master | Build from source | See below | -| es-1.x | Build from source | [2.6.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/es-1.x/#version-260-snapshot-for-elasticsearch-1x) | -| es-1.5 | 2.5.0 | [2.5.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.5.0/#version-250-for-elasticsearch-15) | -| es-1.4 | 2.4.2 | [2.4.2](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.2/#version-242-for-elasticsearch-14) | -| < 1.4.3 | 2.4.1 | [2.4.1](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.1/#version-241-for-elasticsearch-14) | -| es-1.3 | 2.3.0 | [2.3.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.3.0/#icu-analysis-for-elasticsearch) | -| es-1.2 | 2.2.0 | [2.2.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.2.0/#icu-analysis-for-elasticsearch) | -| es-1.1 | 2.1.0 | [2.1.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.1.0/#icu-analysis-for-elasticsearch) | -| es-1.0 | 2.0.0 | 
[2.0.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.0.0/#icu-analysis-for-elasticsearch) | -| es-0.90 | 1.13.0 | [1.13.0](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v1.13.0/#icu-analysis-for-elasticsearch) | +| es-1.x | Build from source | [2.6.0-SNAPSHOT](https://github.com/elastic/elasticsearch-analysis-icu/tree/es-1.x/#version-260-snapshot-for-elasticsearch-1x) | +| es-1.5 | 2.5.0 | [2.5.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.5.0/#version-250-for-elasticsearch-15) | +| es-1.4 | Build from source | [2.4.3-SNAPSHOT](https://github.com/elastic/elasticsearch-analysis-icu/tree/es-1.4/#version-243-snapshot-for-elasticsearch-14) | +| < 1.4.5 | 2.4.2 | [2.4.2](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.4.2/#version-242-for-elasticsearch-14) | +| < 1.4.3 | 2.4.1 | [2.4.1](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.4.1/#version-241-for-elasticsearch-14) | +| es-1.3 | 2.3.0 | [2.3.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.3.0/#icu-analysis-for-elasticsearch) | +| es-1.2 | 2.2.0 | [2.2.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.2.0/#icu-analysis-for-elasticsearch) | +| es-1.1 | 2.1.0 | [2.1.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.1.0/#icu-analysis-for-elasticsearch) | +| es-1.0 | 2.0.0 | [2.0.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.0.0/#icu-analysis-for-elasticsearch) | +| es-0.90 | 1.13.0 | [1.13.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v1.13.0/#icu-analysis-for-elasticsearch) | To build a `SNAPSHOT` version, you need to build it with Maven: From 2f5571dee174ae4bccec30464547b0c1095cf345 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 30 Apr 2015 16:04:15 +0200 Subject: [PATCH 115/131] update documentation with release 2.4.3 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 
50735972b8e..5cc688ad6c2 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, simply run: ```sh -bin/plugin install elasticsearch/elasticsearch-analysis-icu/2.5.0 +bin/plugin install elasticsearch/elasticsearch-analysis-icu/2.4.3 ``` You need to install a version matching your Elasticsearch version: @@ -16,7 +16,7 @@ You need to install a version matching your Elasticsearch version: | master | Build from source | See below | | es-1.x | Build from source | [2.6.0-SNAPSHOT](https://github.com/elastic/elasticsearch-analysis-icu/tree/es-1.x/#version-260-snapshot-for-elasticsearch-1x) | | es-1.5 | 2.5.0 | [2.5.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.5.0/#version-250-for-elasticsearch-15) | -| es-1.4 | Build from source | [2.4.3-SNAPSHOT](https://github.com/elastic/elasticsearch-analysis-icu/tree/es-1.4/#version-243-snapshot-for-elasticsearch-14) | +| es-1.4 | 2.4.3 | [2.4.3](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.3/#version-243-for-elasticsearch-14) | | < 1.4.5 | 2.4.2 | [2.4.2](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.4.2/#version-242-for-elasticsearch-14) | | < 1.4.3 | 2.4.1 | [2.4.1](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.4.1/#version-241-for-elasticsearch-14) | | es-1.3 | 2.3.0 | [2.3.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.3.0/#icu-analysis-for-elasticsearch) | From e2d75f3e07951e0c6bf253c2a07fb76b93bbc86d Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 30 Apr 2015 16:10:56 +0200 Subject: [PATCH 116/131] Latest version is 2.5.0 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5cc688ad6c2..95d955980d4 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding In order to install the plugin, 
simply run: ```sh -bin/plugin install elasticsearch/elasticsearch-analysis-icu/2.4.3 +bin/plugin install elasticsearch/elasticsearch-analysis-icu/2.5.0 ``` You need to install a version matching your Elasticsearch version: From f9ec1ed232db7f27f0af6ef174481f9b0f620909 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 5 May 2015 12:09:26 -0400 Subject: [PATCH 117/131] Tests: fix tests not to use CWD --- .gitignore | 1 + .../index/analysis/SimpleIcuAnalysisTests.java | 1 + .../analysis/SimpleIcuCollationTokenFilterTests.java | 9 +++++++++ .../analysis/SimpleIcuNormalizerCharFilterTests.java | 2 ++ 4 files changed, 13 insertions(+) diff --git a/.gitignore b/.gitignore index e432ab87145..eb5dc3f5b62 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ /plugin_tools /.local-execution-hints.log /.local-*-execution-hints.log +/eclipse-build/ diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java index 8408a3231e0..8f1605ca1dd 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java @@ -33,6 +33,7 @@ public class SimpleIcuAnalysisTests extends ElasticsearchTestCase { @Test public void testDefaultsIcuAnalysis() { Settings settings = settingsBuilder() + .put("path.home", createTempDir()) .loadFromClasspath("org/elasticsearch/index/analysis/phonetic-1.yml").build(); AnalysisService analysisService = createAnalysisService(settings); diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java index 2ac85a4a5d2..446c1c9af20 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java @@ -49,6 
+49,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { @Test public void testBasicUsage() throws Exception { Settings settings = ImmutableSettings.settingsBuilder() + .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "tr") .put("index.analysis.filter.myCollator.strength", "primary") @@ -65,6 +66,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { @Test public void testNormalization() throws IOException { Settings settings = ImmutableSettings.settingsBuilder() + .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "tr") .put("index.analysis.filter.myCollator.strength", "primary") @@ -82,6 +84,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { @Test public void testSecondaryStrength() throws IOException { Settings settings = ImmutableSettings.settingsBuilder() + .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "en") .put("index.analysis.filter.myCollator.strength", "secondary") @@ -100,6 +103,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { @Test public void testIgnorePunctuation() throws IOException { Settings settings = ImmutableSettings.settingsBuilder() + .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "en") .put("index.analysis.filter.myCollator.strength", "primary") @@ -118,6 +122,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { @Test public void testIgnoreWhitespace() throws IOException { Settings settings = ImmutableSettings.settingsBuilder() + .put("path.home", createTempDir()) 
.put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "en") .put("index.analysis.filter.myCollator.strength", "primary") @@ -139,6 +144,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { @Test public void testNumerics() throws IOException { Settings settings = ImmutableSettings.settingsBuilder() + .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "en") .put("index.analysis.filter.myCollator.numeric", "true") @@ -156,6 +162,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { @Test public void testIgnoreAccentsButNotCase() throws IOException { Settings settings = ImmutableSettings.settingsBuilder() + .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "en") .put("index.analysis.filter.myCollator.strength", "primary") @@ -177,6 +184,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { @Test public void testUpperCaseFirst() throws IOException { Settings settings = ImmutableSettings.settingsBuilder() + .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "en") .put("index.analysis.filter.myCollator.strength", "tertiary") @@ -207,6 +215,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { String tailoredRules = tailoredCollator.getRules(); Settings settings = ImmutableSettings.settingsBuilder() + .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.rules", tailoredRules) .put("index.analysis.filter.myCollator.strength", "primary") diff --git 
a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java index b2e6e3dfb49..3bb39e7931b 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java @@ -39,6 +39,7 @@ public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase { public void testDefaultSetting() throws Exception { Settings settings = ImmutableSettings.settingsBuilder() + .put("path.home", createTempDir()) .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer") .build(); AnalysisService analysisService = createAnalysisService(settings); @@ -64,6 +65,7 @@ public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase { public void testNameAndModeSetting() throws Exception { Settings settings = ImmutableSettings.settingsBuilder() + .put("path.home", createTempDir()) .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer") .put("index.analysis.char_filter.myNormalizerChar.name", "nfkc") .put("index.analysis.char_filter.myNormalizerChar.mode", "decompose") From 175847c423481077cb5125c7ec974f0a361378b1 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 11 May 2015 18:35:02 -0400 Subject: [PATCH 118/131] enable security manager in tests --- pom.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/pom.xml b/pom.xml index be4d6ee02b8..9772bf53c90 100644 --- a/pom.xml +++ b/pom.xml @@ -34,6 +34,7 @@ 1 INFO + true From 1fad6fe356e498d1e7934705c3dca74199dbf836 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 11 May 2015 19:40:56 -0400 Subject: [PATCH 119/131] Remove outdated policy file --- dev-tools/tests.policy | 54 ------------------------------------------ 1 file changed, 54 deletions(-) delete mode 100644 dev-tools/tests.policy diff --git a/dev-tools/tests.policy 
b/dev-tools/tests.policy deleted file mode 100644 index 6afb5025840..00000000000 --- a/dev-tools/tests.policy +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// Policy file to prevent tests from writing outside the test sandbox directory -// PLEASE NOTE: You may need to enable other permissions when new tests are added, -// everything not allowed here is forbidden! 
- -grant { - // permissions for file access, write access only to sandbox: - permission java.io.FilePermission "<>", "read,execute"; - permission java.io.FilePermission "${junit4.childvm.cwd}", "read,execute,write"; - permission java.io.FilePermission "${junit4.childvm.cwd}${/}-", "read,execute,write,delete"; - permission java.io.FilePermission "${junit4.tempDir}${/}*", "read,execute,write,delete"; - permission groovy.security.GroovyCodeSourcePermission "/groovy/script"; - - // Allow connecting to the internet anywhere - permission java.net.SocketPermission "*", "accept,listen,connect,resolve"; - - // Basic permissions needed for Lucene / Elasticsearch to work: - permission java.util.PropertyPermission "*", "read,write"; - permission java.lang.reflect.ReflectPermission "*"; - permission java.lang.RuntimePermission "*"; - - // These two *have* to be spelled out a separate - permission java.lang.management.ManagementPermission "control"; - permission java.lang.management.ManagementPermission "monitor"; - - permission java.net.NetPermission "*"; - permission java.util.logging.LoggingPermission "control"; - permission javax.management.MBeanPermission "*", "*"; - permission javax.management.MBeanServerPermission "*"; - permission javax.management.MBeanTrustPermission "*"; - - // Needed for some things in DNS caching in the JVM - permission java.security.SecurityPermission "getProperty.networkaddress.cache.ttl"; - permission java.security.SecurityPermission "getProperty.networkaddress.cache.negative.ttl"; - -}; From 546db210d8278b69b6b451517cf36e75131bd953 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 11 May 2015 21:22:58 -0400 Subject: [PATCH 120/131] remove unnecessary prop --- pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/pom.xml b/pom.xml index 9772bf53c90..be4d6ee02b8 100644 --- a/pom.xml +++ b/pom.xml @@ -34,7 +34,6 @@ 1 INFO - true From 4c860456e8a74338a5173d72b86cefae59ed7a94 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 21 May 2015 
10:32:30 -0400 Subject: [PATCH 121/131] engage forbidden APIs --- pom.xml | 43 +------------------ .../IcuCollationTokenFilterFactory.java | 12 ++---- 2 files changed, 4 insertions(+), 51 deletions(-) diff --git a/pom.xml b/pom.xml index be4d6ee02b8..ec59c96a44a 100644 --- a/pom.xml +++ b/pom.xml @@ -41,10 +41,7 @@ org.hamcrest hamcrest-all - - com.carrotsearch.randomizedtesting - randomizedtesting-runner - + org.apache.lucene lucene-test-framework @@ -74,48 +71,10 @@ - - - src/main/resources - true - - - - org.apache.maven.plugins - maven-compiler-plugin - - - com.carrotsearch.randomizedtesting - junit4-maven-plugin - - - org.apache.maven.plugins - maven-surefire-plugin - - - - org.apache.maven.plugins - maven-source-plugin - org.apache.maven.plugins maven-assembly-plugin - - false - ${project.build.directory}/releases/ - - ${basedir}/src/main/assemblies/plugin.xml - - - - - package - - single - - - diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java index b250dc3c56e..2460fd7a911 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java @@ -34,9 +34,7 @@ import org.elasticsearch.index.Index; import org.elasticsearch.index.settings.IndexSettings; import java.io.IOException; -import java.net.URISyntaxException; import java.nio.file.Files; -import java.nio.file.Paths; /** * An ICU based collation token filter. 
There are two ways to configure collation: @@ -60,15 +58,11 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { Collator collator; String rules = settings.get("rules"); if (rules != null) { - FailedToResolveConfigException failureToResolve = null; + Exception failureToResolve = null; try { - rules = Streams.copyToString(Files.newBufferedReader(Paths.get(environment.resolveConfig(rules).toURI()), Charsets.UTF_8)); - } catch (FailedToResolveConfigException e) { + rules = Streams.copyToString(Files.newBufferedReader(environment.configFile().resolve(rules), Charsets.UTF_8)); + } catch (FailedToResolveConfigException | IOException | SecurityException e) { failureToResolve = e; - } catch (IOException e) { - throw new IllegalArgumentException("Failed to load collation rules", e); - } catch (URISyntaxException e) { - throw new IllegalArgumentException("Failed to load collation rules", e); } try { collator = new RuleBasedCollator(rules); From 0e17ce7f18c5f99c27c11fac500fab75903d894e Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 21 May 2015 16:47:00 -0400 Subject: [PATCH 122/131] remove duplicate test config --- pom.xml | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/pom.xml b/pom.xml index ec59c96a44a..7abd7f6b225 100644 --- a/pom.xml +++ b/pom.xml @@ -37,37 +37,10 @@ - - org.hamcrest - hamcrest-all - - - - org.apache.lucene - lucene-test-framework - - - - org.elasticsearch - elasticsearch - - org.apache.lucene lucene-analyzers-icu - - - log4j - log4j - - - - org.elasticsearch - elasticsearch - test-jar - - From 4fcd2643f210edc43bc8ca5e246d27d3c29d5245 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 21 May 2015 22:50:30 -0400 Subject: [PATCH 123/131] switch to plugin pom --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 7abd7f6b225..b2e61567214 100644 --- a/pom.xml +++ b/pom.xml @@ -27,7 +27,7 @@ org.elasticsearch - elasticsearch-parent + 
elasticsearch-plugin 2.0.0-SNAPSHOT From fbc74ff3393179d2015387913793e5348fc4ef75 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 22 May 2015 00:02:25 -0400 Subject: [PATCH 124/131] remove tabs --- .../IcuFoldingTokenFilterFactory.java | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java index 6badfd48c12..7abfd702ac3 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java @@ -52,21 +52,21 @@ public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory { } @Override public TokenStream create(TokenStream tokenStream) { - - // The ICUFoldingFilter is in fact implemented as a ICUNormalizer2Filter. - // ICUFoldingFilter lacks a constructor for adding filtering so we implemement it here - if (unicodeSetFilter != null) { - Normalizer2 base = Normalizer2.getInstance( - ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"), - "utr30", Normalizer2.Mode.COMPOSE); - UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter); - unicodeSet.freeze(); - Normalizer2 filtered = new FilteredNormalizer2(base, unicodeSet); - return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, filtered); - } - else { - return new ICUFoldingFilter(tokenStream); - } + // The ICUFoldingFilter is in fact implemented as a ICUNormalizer2Filter. 
+ // ICUFoldingFilter lacks a constructor for adding filtering so we implemement it here + if (unicodeSetFilter != null) { + Normalizer2 base = Normalizer2.getInstance( + ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"), + "utr30", Normalizer2.Mode.COMPOSE); + UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter); + + unicodeSet.freeze(); + Normalizer2 filtered = new FilteredNormalizer2(base, unicodeSet); + return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, filtered); + } + else { + return new ICUFoldingFilter(tokenStream); + } } } From bd911c121c5579d99b59760d9c2507c7d5a05ea4 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 22 May 2015 09:20:11 -0400 Subject: [PATCH 125/131] remove logging properties --- src/test/resources/log4j.properties | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 src/test/resources/log4j.properties diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties deleted file mode 100644 index 497c97f9959..00000000000 --- a/src/test/resources/log4j.properties +++ /dev/null @@ -1,5 +0,0 @@ -log4j.rootLogger=INFO, out - -log4j.appender.out=org.apache.log4j.ConsoleAppender -log4j.appender.out.layout=org.apache.log4j.PatternLayout -log4j.appender.out.layout.conversionPattern=[%d{ISO8601}][%-5p][%-25c] %m%n From ca889286ebb26451a8c17f9f31cb62f4426c7e7a Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 22 May 2015 18:23:54 -0400 Subject: [PATCH 126/131] fix license headers --- pom.xml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pom.xml b/pom.xml index b2e61567214..5014c60a91b 100644 --- a/pom.xml +++ b/pom.xml @@ -49,6 +49,18 @@ org.apache.maven.plugins maven-assembly-plugin + + com.mycila + license-maven-plugin + + + + **/IndexableBinaryStringTools.java + **/ICUCollationKeyFilter.java + **/TestIndexableBinaryStringTools.java + + + From 3f539dcf087a0e1f6035d48a5f759d83d065f5d2 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sat, 23 May 2015 08:23:37 
-0400 Subject: [PATCH 127/131] Allow license generation to run --- pom.xml | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/pom.xml b/pom.xml index 5014c60a91b..2024908cd61 100644 --- a/pom.xml +++ b/pom.xml @@ -49,19 +49,24 @@ org.apache.maven.plugins maven-assembly-plugin - - com.mycila - license-maven-plugin - - - - **/IndexableBinaryStringTools.java - **/ICUCollationKeyFilter.java - **/TestIndexableBinaryStringTools.java - - - - + + + + + + com.mycila + license-maven-plugin + + + + **/IndexableBinaryStringTools.java + **/ICUCollationKeyFilter.java + **/TestIndexableBinaryStringTools.java + + + + + From 97e60161378ab98843e07d33005c8c143b41bbf9 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Tue, 26 May 2015 08:05:53 -0400 Subject: [PATCH 128/131] Absorb ImmutableSettings into Settings --- .../index/analysis/AnalysisTestUtils.java | 2 +- .../index/analysis/ICUIntegrationTests.java | 5 ++--- .../analysis/SimpleIcuAnalysisTests.java | 2 +- .../SimpleIcuCollationTokenFilterTests.java | 19 +++++++++---------- .../SimpleIcuNormalizerCharFilterTests.java | 5 ++--- 5 files changed, 15 insertions(+), 18 deletions(-) diff --git a/src/test/java/org/elasticsearch/index/analysis/AnalysisTestUtils.java b/src/test/java/org/elasticsearch/index/analysis/AnalysisTestUtils.java index dc619d22e2f..dec0e5dd632 100644 --- a/src/test/java/org/elasticsearch/index/analysis/AnalysisTestUtils.java +++ b/src/test/java/org/elasticsearch/index/analysis/AnalysisTestUtils.java @@ -33,7 +33,7 @@ import org.elasticsearch.index.settings.IndexSettingsModule; import org.elasticsearch.indices.analysis.IndicesAnalysisModule; import org.elasticsearch.indices.analysis.IndicesAnalysisService; -import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; +import static org.elasticsearch.common.settings.Settings.settingsBuilder; public class AnalysisTestUtils { diff --git 
a/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java b/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java index 95874c98b07..058f6c0b7b7 100644 --- a/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java @@ -21,7 +21,6 @@ package org.elasticsearch.index.analysis; import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse; import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse; import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.index.query.QueryBuilders; @@ -41,7 +40,7 @@ public class ICUIntegrationTests extends ElasticsearchIntegrationTest { @Override protected Settings nodeSettings(int nodeOrdinal) { - return ImmutableSettings.builder() + return Settings.builder() .put(super.nodeSettings(nodeOrdinal)) .put("plugins." 
+ PluginsService.LOAD_PLUGIN_FROM_CLASSPATH, true) .build(); @@ -49,7 +48,7 @@ public class ICUIntegrationTests extends ElasticsearchIntegrationTest { @Override public Settings indexSettings() { - Settings settings = ImmutableSettings.builder() + Settings settings = Settings.builder() .put(super.indexSettings()) .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard") .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_collator") diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java index 8f1605ca1dd..672cf7c651d 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java @@ -23,7 +23,7 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.test.ElasticsearchTestCase; import org.junit.Test; -import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; +import static org.elasticsearch.common.settings.Settings.settingsBuilder; import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService; import static org.hamcrest.Matchers.instanceOf; /** diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java index 446c1c9af20..9f2d3fbf65e 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java @@ -26,7 +26,6 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.elasticsearch.common.settings.ImmutableSettings; 
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.test.ElasticsearchTestCase; import org.junit.Test; @@ -48,7 +47,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { */ @Test public void testBasicUsage() throws Exception { - Settings settings = ImmutableSettings.settingsBuilder() + Settings settings = Settings.settingsBuilder() .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "tr") @@ -65,7 +64,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { */ @Test public void testNormalization() throws IOException { - Settings settings = ImmutableSettings.settingsBuilder() + Settings settings = Settings.settingsBuilder() .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "tr") @@ -83,7 +82,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { */ @Test public void testSecondaryStrength() throws IOException { - Settings settings = ImmutableSettings.settingsBuilder() + Settings settings = Settings.settingsBuilder() .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "en") @@ -102,7 +101,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { */ @Test public void testIgnorePunctuation() throws IOException { - Settings settings = ImmutableSettings.settingsBuilder() + Settings settings = Settings.settingsBuilder() .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "en") @@ -121,7 +120,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { */ @Test public void testIgnoreWhitespace() throws IOException { - Settings 
settings = ImmutableSettings.settingsBuilder() + Settings settings = Settings.settingsBuilder() .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "en") @@ -143,7 +142,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { */ @Test public void testNumerics() throws IOException { - Settings settings = ImmutableSettings.settingsBuilder() + Settings settings = Settings.settingsBuilder() .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "en") @@ -161,7 +160,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { */ @Test public void testIgnoreAccentsButNotCase() throws IOException { - Settings settings = ImmutableSettings.settingsBuilder() + Settings settings = Settings.settingsBuilder() .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "en") @@ -183,7 +182,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { */ @Test public void testUpperCaseFirst() throws IOException { - Settings settings = ImmutableSettings.settingsBuilder() + Settings settings = Settings.settingsBuilder() .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") .put("index.analysis.filter.myCollator.language", "en") @@ -214,7 +213,7 @@ public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings); String tailoredRules = tailoredCollator.getRules(); - Settings settings = ImmutableSettings.settingsBuilder() + Settings settings = Settings.settingsBuilder() .put("path.home", createTempDir()) .put("index.analysis.filter.myCollator.type", "icu_collation") 
.put("index.analysis.filter.myCollator.rules", tailoredRules) diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java index 3bb39e7931b..ef747e6f56e 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java @@ -21,7 +21,6 @@ package org.elasticsearch.index.analysis; import com.ibm.icu.text.Normalizer2; import org.apache.lucene.analysis.CharFilter; -import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.test.ElasticsearchTestCase; import org.junit.Test; @@ -38,7 +37,7 @@ public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase { @Test public void testDefaultSetting() throws Exception { - Settings settings = ImmutableSettings.settingsBuilder() + Settings settings = Settings.settingsBuilder() .put("path.home", createTempDir()) .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer") .build(); @@ -64,7 +63,7 @@ public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase { @Test public void testNameAndModeSetting() throws Exception { - Settings settings = ImmutableSettings.settingsBuilder() + Settings settings = Settings.settingsBuilder() .put("path.home", createTempDir()) .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer") .put("index.analysis.char_filter.myNormalizerChar.name", "nfkc") From 2aea018feb360b504333185393417456c976951c Mon Sep 17 00:00:00 2001 From: Gasol Wu Date: Sun, 31 May 2015 23:02:42 +0800 Subject: [PATCH 129/131] Update documentation for ICU Transform Fixes #40 --- README.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/README.md b/README.md index 95d955980d4..46960a69b86 
100644 --- a/README.md +++ b/README.md @@ -224,6 +224,52 @@ Here is a sample settings: } ``` +ICU Transform +------------- +Transforms are used to process Unicode text in many different ways. Some include case mapping, normalization, +transliteration and bidirectional text handling. + +You can defined transliterator identifiers by using `id` property, and specify direction to `forward` or `reverse` by +using `dir` property, The default value of both properties are `Null` and `forward`. + +For example: + +```js +{ + "index" : { + "analysis" : { + "analyzer" : { + "latin" : { + "tokenizer" : "keyword", + "filter" : ["myLatinTransform"] + } + }, + "filter" : { + "myLatinTransform" : { + "type" : "icu_transform", + "id" : "Any-Latin; NFD; [:Nonspacing Mark:] Remove; NFC" + } + } + } + } +} +``` + +This transform transliterated characters to latin, and separates accents from their base characters, removes the accents, +and then puts the remaining text into an unaccented form. + +The results are: + +`你好` to `ni hao` + +`здравствуйте` to `zdravstvujte` + +`こんにちは` to `kon'nichiha` + +Currently the filter only supports identifier and direction, custom rulesets are not yet supported. + +For more documentation, Please see the [user guide of ICU Transform](http://userguide.icu-project.org/transforms/general). 
+ License ------- From da054171ca5a535da337ab296507703c207ff49b Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 4 Jun 2015 22:24:45 +0200 Subject: [PATCH 130/131] fix analysis-icu to not use shaded APIs --- .../index/analysis/IcuCollationTokenFilterFactory.java | 4 ++-- .../plugin/analysis/icu/AnalysisICUPlugin.java | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java index 2460fd7a911..2890d135cd2 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java @@ -23,7 +23,6 @@ import com.ibm.icu.text.Collator; import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.util.ULocale; import org.apache.lucene.analysis.TokenStream; -import org.elasticsearch.common.base.Charsets; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.io.Streams; @@ -34,6 +33,7 @@ import org.elasticsearch.index.Index; import org.elasticsearch.index.settings.IndexSettings; import java.io.IOException; +import java.nio.charset.Charset; import java.nio.file.Files; /** @@ -60,7 +60,7 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { if (rules != null) { Exception failureToResolve = null; try { - rules = Streams.copyToString(Files.newBufferedReader(environment.configFile().resolve(rules), Charsets.UTF_8)); + rules = Streams.copyToString(Files.newBufferedReader(environment.configFile().resolve(rules), Charset.forName("UTF-8"))); } catch (FailedToResolveConfigException | IOException | SecurityException e) { failureToResolve = e; } diff --git a/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java 
b/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java index a1cafe6508c..be73376fd24 100644 --- a/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java +++ b/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java @@ -19,13 +19,13 @@ package org.elasticsearch.plugin.analysis.icu; -import org.elasticsearch.common.collect.ImmutableList; import org.elasticsearch.common.inject.Module; import org.elasticsearch.index.analysis.AnalysisModule; import org.elasticsearch.index.analysis.IcuAnalysisBinderProcessor; import org.elasticsearch.indices.analysis.IcuIndicesAnalysisModule; import org.elasticsearch.plugins.AbstractPlugin; +import java.util.ArrayList; import java.util.Collection; /** @@ -45,7 +45,9 @@ public class AnalysisICUPlugin extends AbstractPlugin { @Override public Collection> modules() { - return ImmutableList.>of(IcuIndicesAnalysisModule.class); + Collection> classes = new ArrayList<>(); + classes.add(IcuIndicesAnalysisModule.class); + return classes; } /** From ed3cc8d03449137a6144716a22646a7b4555c569 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Fri, 5 Jun 2015 13:12:23 +0200 Subject: [PATCH 131/131] add analysis-icu module --- .gitignore | 14 -- CONTRIBUTING.md | 98 --------- LICENSE.txt | 202 ------------------ dev-tools/release.py | 134 ------------ README.md => plugins/analysis-icu/README.md | 0 pom.xml => plugins/analysis-icu/pom.xml | 26 +-- .../src}/main/assemblies/plugin.xml | 0 .../index/analysis/ICUCollationKeyFilter.java | 0 .../analysis/IcuAnalysisBinderProcessor.java | 0 .../IcuCollationTokenFilterFactory.java | 0 .../IcuFoldingTokenFilterFactory.java | 0 .../IcuNormalizerCharFilterFactory.java | 0 .../IcuNormalizerTokenFilterFactory.java | 0 .../index/analysis/IcuTokenizerFactory.java | 0 .../IcuTransformTokenFilterFactory.java | 0 .../analysis/IndexableBinaryStringTools.java | 0 .../indices/analysis/IcuIndicesAnalysis.java | 0 .../analysis/IcuIndicesAnalysisModule.java | 0 
.../analysis/icu/AnalysisICUPlugin.java | 0 .../src}/main/resources/es-plugin.properties | 0 .../index/analysis/AnalysisTestUtils.java | 0 .../index/analysis/ICUIntegrationTests.java | 0 .../analysis/SimpleIcuAnalysisTests.java | 0 .../SimpleIcuCollationTokenFilterTests.java | 0 .../SimpleIcuNormalizerCharFilterTests.java | 0 .../TestIndexableBinaryStringTools.java | 0 26 files changed, 2 insertions(+), 472 deletions(-) delete mode 100644 .gitignore delete mode 100644 CONTRIBUTING.md delete mode 100644 LICENSE.txt delete mode 100644 dev-tools/release.py rename README.md => plugins/analysis-icu/README.md (100%) rename pom.xml => plugins/analysis-icu/pom.xml (67%) rename {src => plugins/analysis-icu/src}/main/assemblies/plugin.xml (100%) rename {src => plugins/analysis-icu/src}/main/java/org/elasticsearch/index/analysis/ICUCollationKeyFilter.java (100%) rename {src => plugins/analysis-icu/src}/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java (100%) rename {src => plugins/analysis-icu/src}/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java (100%) rename {src => plugins/analysis-icu/src}/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java (100%) rename {src => plugins/analysis-icu/src}/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java (100%) rename {src => plugins/analysis-icu/src}/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java (100%) rename {src => plugins/analysis-icu/src}/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java (100%) rename {src => plugins/analysis-icu/src}/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java (100%) rename {src => plugins/analysis-icu/src}/main/java/org/elasticsearch/index/analysis/IndexableBinaryStringTools.java (100%) rename {src => plugins/analysis-icu/src}/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java (100%) rename {src => 
plugins/analysis-icu/src}/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysisModule.java (100%) rename {src => plugins/analysis-icu/src}/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java (100%) rename {src => plugins/analysis-icu/src}/main/resources/es-plugin.properties (100%) rename {src => plugins/analysis-icu/src}/test/java/org/elasticsearch/index/analysis/AnalysisTestUtils.java (100%) rename {src => plugins/analysis-icu/src}/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java (100%) rename {src => plugins/analysis-icu/src}/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java (100%) rename {src => plugins/analysis-icu/src}/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java (100%) rename {src => plugins/analysis-icu/src}/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java (100%) rename {src => plugins/analysis-icu/src}/test/java/org/elasticsearch/index/analysis/TestIndexableBinaryStringTools.java (100%) diff --git a/.gitignore b/.gitignore deleted file mode 100644 index eb5dc3f5b62..00000000000 --- a/.gitignore +++ /dev/null @@ -1,14 +0,0 @@ -/data -/work -/logs -/.idea -/target -.DS_Store -*.iml -/.settings -/.classpath -/.project -/plugin_tools -/.local-execution-hints.log -/.local-*-execution-hints.log -/eclipse-build/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index 238e8c368f1..00000000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,98 +0,0 @@ -Contributing to elasticsearch -============================= - -Elasticsearch is an open source project and we love to receive contributions from our community — you! There are many ways to contribute, from writing tutorials or blog posts, improving the documentation, submitting bug reports and feature requests or writing code which can be incorporated into Elasticsearch itself. 
- -Bug reports ------------ - -If you think you have found a bug in Elasticsearch, first make sure that you are testing against the [latest version of Elasticsearch](http://www.elasticsearch.org/download/) - your issue may already have been fixed. If not, search our [issues list](https://github.com/elasticsearch/elasticsearch/issues) on GitHub in case a similar issue has already been opened. - -It is very helpful if you can prepare a reproduction of the bug. In other words, provide a small test case which we can run to confirm your bug. It makes it easier to find the problem and to fix it. Test cases should be provided as `curl` commands which we can copy and paste into a terminal to run it locally, for example: - -```sh -# delete the index -curl -XDELETE localhost:9200/test - -# insert a document -curl -XPUT localhost:9200/test/test/1 -d '{ - "title": "test document" -}' - -# this should return XXXX but instead returns YYY -curl .... -``` - -Provide as much information as you can. You may think that the problem lies with your query, when actually it depends on how your data is indexed. The easier it is for us to recreate your problem, the faster it is likely to be fixed. - -Feature requests ----------------- - -If you find yourself wishing for a feature that doesn't exist in Elasticsearch, you are probably not alone. There are bound to be others out there with similar needs. Many of the features that Elasticsearch has today have been added because our users saw the need. -Open an issue on our [issues list](https://github.com/elasticsearch/elasticsearch/issues) on GitHub which describes the feature you would like to see, why you need it, and how it should work. - -Contributing code and documentation changes -------------------------------------------- - -If you have a bugfix or new feature that you would like to contribute to Elasticsearch, please find or open an issue about it first. Talk about what you would like to do. 
It may be that somebody is already working on it, or that there are particular issues that you should know about before implementing the change. - -We enjoy working with contributors to get their code accepted. There are many approaches to fixing a problem and it is important to find the best approach before writing too much code. - -The process for contributing to any of the [Elasticsearch repositories](https://github.com/elasticsearch/) is similar. Details for individual projects can be found below. - -### Fork and clone the repository - -You will need to fork the main Elasticsearch code or documentation repository and clone it to your local machine. See -[github help page](https://help.github.com/articles/fork-a-repo) for help. - -Further instructions for specific projects are given below. - -### Submitting your changes - -Once your changes and tests are ready to submit for review: - -1. Test your changes -Run the test suite to make sure that nothing is broken. - -2. Sign the Contributor License Agreement -Please make sure you have signed our [Contributor License Agreement](http://www.elasticsearch.org/contributor-agreement/). We are not asking you to assign copyright to us, but to give us the right to distribute your code without restriction. We ask this of all contributors in order to assure our users of the origin and continuing existence of the code. You only need to sign the CLA once. - -3. Rebase your changes -Update your local repository with the most recent code from the main Elasticsearch repository, and rebase your branch on top of the latest master branch. We prefer your changes to be squashed into a single commit. - -4. Submit a pull request -Push your local changes to your forked copy of the repository and [submit a pull request](https://help.github.com/articles/using-pull-requests). In the pull request, describe what your changes do and mention the number of the issue where discussion has taken place, eg "Closes #123". - -Then sit back and wait. 
There will probably be discussion about the pull request and, if any changes are needed, we would love to work with you to get your pull request merged into Elasticsearch. - - -Contributing to the Elasticsearch plugin ----------------------------------------- - -**Repository:** [https://github.com/elasticsearch/elasticsearch-analysis-icu](https://github.com/elasticsearch/elasticsearch-analysis-icu) - -Make sure you have [Maven](http://maven.apache.org) installed, as Elasticsearch uses it as its build system. Integration with IntelliJ and Eclipse should work out of the box. Eclipse users can automatically configure their IDE by running `mvn eclipse:eclipse` and then importing the project into their workspace: `File > Import > Existing project into workspace`. - -Please follow these formatting guidelines: - -* Java indent is 4 spaces -* Line width is 140 characters -* The rest is left to Java coding standards -* Disable “auto-format on save” to prevent unnecessary format changes. This makes reviews much harder as it generates unnecessary formatting changes. If your IDE supports formatting only modified chunks that is fine to do. - -To create a distribution from the source, simply run: - -```sh -cd elasticsearch-analysis-icu/ -mvn clean package -DskipTests -``` - -You will find the newly built packages under: `./target/releases/`. - -Before submitting your changes, run the test suite to make sure that nothing is broken, with: - -```sh -mvn clean test -``` - -Source: [Contributing to elasticsearch](http://www.elasticsearch.org/contributing-to-elasticsearch/) diff --git a/LICENSE.txt b/LICENSE.txt deleted file mode 100644 index d6456956733..00000000000 --- a/LICENSE.txt +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. 
- - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/dev-tools/release.py b/dev-tools/release.py deleted file mode 100644 index edcc637d068..00000000000 --- a/dev-tools/release.py +++ /dev/null @@ -1,134 +0,0 @@ -# Licensed to Elasticsearch under one or more contributor -# license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright -# ownership. Elasticsearch licenses this file to you under -# the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on -# an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, -# either express or implied. See the License for the specific -# language governing permissions and limitations under the License. - -import datetime -import os -import shutil -import sys -import time -import urllib -import urllib.request -import zipfile - -from os.path import dirname, abspath - -""" - This tool builds a release from the a given elasticsearch plugin branch. - - It is basically a wrapper on top of launch_release.py which: - - - tries to get a more recent version of launch_release.py in ... - - download it if needed - - launch it passing all arguments to it, like: - - $ python3 dev_tools/release.py --branch master --publish --remote origin - - Important options: - - # Dry run - $ python3 dev_tools/release.py - - # Dry run without tests - python3 dev_tools/release.py --skiptests - - # Release, publish artifacts and announce - $ python3 dev_tools/release.py --publish - - See full documentation in launch_release.py -""" -env = os.environ - -# Change this if the source repository for your scripts is at a different location -SOURCE_REPO = 'elasticsearch/elasticsearch-plugins-script' -# We define that we should download again the script after 1 days -SCRIPT_OBSOLETE_DAYS = 1 -# We ignore in master.zip file the following files -IGNORED_FILES = ['.gitignore', 'README.md'] - - -ROOT_DIR = abspath(os.path.join(abspath(dirname(__file__)), '../')) -TARGET_TOOLS_DIR = ROOT_DIR + '/plugin_tools' -DEV_TOOLS_DIR = ROOT_DIR + '/dev-tools' -BUILD_RELEASE_FILENAME = 'release.zip' -BUILD_RELEASE_FILE = TARGET_TOOLS_DIR + '/' + BUILD_RELEASE_FILENAME -SOURCE_URL = 'https://github.com/%s/archive/master.zip' % SOURCE_REPO - -# Download a recent version of the release plugin tool -try: - 
os.mkdir(TARGET_TOOLS_DIR) - print('directory %s created' % TARGET_TOOLS_DIR) -except FileExistsError: - pass - - -try: - # we check latest update. If we ran an update recently, we - # are not going to check it again - download = True - - try: - last_download_time = datetime.datetime.fromtimestamp(os.path.getmtime(BUILD_RELEASE_FILE)) - if (datetime.datetime.now()-last_download_time).days < SCRIPT_OBSOLETE_DAYS: - download = False - except FileNotFoundError: - pass - - if download: - urllib.request.urlretrieve(SOURCE_URL, BUILD_RELEASE_FILE) - with zipfile.ZipFile(BUILD_RELEASE_FILE) as myzip: - for member in myzip.infolist(): - filename = os.path.basename(member.filename) - # skip directories - if not filename: - continue - if filename in IGNORED_FILES: - continue - - # copy file (taken from zipfile's extract) - source = myzip.open(member.filename) - target = open(os.path.join(TARGET_TOOLS_DIR, filename), "wb") - with source, target: - shutil.copyfileobj(source, target) - # We keep the original date - date_time = time.mktime(member.date_time + (0, 0, -1)) - os.utime(os.path.join(TARGET_TOOLS_DIR, filename), (date_time, date_time)) - print('plugin-tools updated from %s' % SOURCE_URL) -except urllib.error.HTTPError: - pass - - -# Let see if we need to update the release.py script itself -source_time = os.path.getmtime(TARGET_TOOLS_DIR + '/release.py') -repo_time = os.path.getmtime(DEV_TOOLS_DIR + '/release.py') -if source_time > repo_time: - input('release.py needs an update. 
Press a key to update it...') - shutil.copyfile(TARGET_TOOLS_DIR + '/release.py', DEV_TOOLS_DIR + '/release.py') - -# We can launch the build process -try: - PYTHON = 'python' - # make sure python3 is used if python3 is available - # some systems use python 2 as default - os.system('python3 --version > /dev/null 2>&1') - PYTHON = 'python3' -except RuntimeError: - pass - -release_args = '' -for x in range(1, len(sys.argv)): - release_args += ' ' + sys.argv[x] - -os.system('%s %s/build_release.py %s' % (PYTHON, TARGET_TOOLS_DIR, release_args)) diff --git a/README.md b/plugins/analysis-icu/README.md similarity index 100% rename from README.md rename to plugins/analysis-icu/README.md diff --git a/pom.xml b/plugins/analysis-icu/pom.xml similarity index 67% rename from pom.xml rename to plugins/analysis-icu/pom.xml index 2024908cd61..701ffbfc405 100644 --- a/pom.xml +++ b/plugins/analysis-icu/pom.xml @@ -3,27 +3,12 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - org.elasticsearch + org.elasticsearch.plugin elasticsearch-analysis-icu - 3.0.0-SNAPSHOT + jar Elasticsearch ICU Analysis plugin The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. 
- https://github.com/elastic/elasticsearch-analysis-icu/ - 2009 - - - The Apache Software License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - repo - - - - scm:git:git@github.com:elastic/elasticsearch-analysis-icu.git - scm:git:git@github.com:elastic/elasticsearch-analysis-icu.git - - http://github.com/elastic/elasticsearch-analysis-icu - org.elasticsearch @@ -69,11 +54,4 @@ - - - oss-snapshots - Sonatype OSS Snapshots - https://oss.sonatype.org/content/repositories/snapshots/ - - diff --git a/src/main/assemblies/plugin.xml b/plugins/analysis-icu/src/main/assemblies/plugin.xml similarity index 100% rename from src/main/assemblies/plugin.xml rename to plugins/analysis-icu/src/main/assemblies/plugin.xml diff --git a/src/main/java/org/elasticsearch/index/analysis/ICUCollationKeyFilter.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/ICUCollationKeyFilter.java similarity index 100% rename from src/main/java/org/elasticsearch/index/analysis/ICUCollationKeyFilter.java rename to plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/ICUCollationKeyFilter.java diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java similarity index 100% rename from src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java rename to plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java similarity index 100% rename from src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java rename to plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java diff --git 
a/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java similarity index 100% rename from src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java rename to plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java similarity index 100% rename from src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java rename to plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java similarity index 100% rename from src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java rename to plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java similarity index 100% rename from src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java rename to plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java similarity index 100% rename from src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java rename 
to plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java diff --git a/src/main/java/org/elasticsearch/index/analysis/IndexableBinaryStringTools.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IndexableBinaryStringTools.java similarity index 100% rename from src/main/java/org/elasticsearch/index/analysis/IndexableBinaryStringTools.java rename to plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IndexableBinaryStringTools.java diff --git a/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java similarity index 100% rename from src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java rename to plugins/analysis-icu/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java diff --git a/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysisModule.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysisModule.java similarity index 100% rename from src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysisModule.java rename to plugins/analysis-icu/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysisModule.java diff --git a/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java similarity index 100% rename from src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java rename to plugins/analysis-icu/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java diff --git a/src/main/resources/es-plugin.properties b/plugins/analysis-icu/src/main/resources/es-plugin.properties similarity index 100% rename from src/main/resources/es-plugin.properties rename to plugins/analysis-icu/src/main/resources/es-plugin.properties 
diff --git a/src/test/java/org/elasticsearch/index/analysis/AnalysisTestUtils.java b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/AnalysisTestUtils.java similarity index 100% rename from src/test/java/org/elasticsearch/index/analysis/AnalysisTestUtils.java rename to plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/AnalysisTestUtils.java diff --git a/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java similarity index 100% rename from src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java rename to plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java similarity index 100% rename from src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java rename to plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java similarity index 100% rename from src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java rename to plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java similarity index 100% rename from src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java rename to 
plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java diff --git a/src/test/java/org/elasticsearch/index/analysis/TestIndexableBinaryStringTools.java b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/TestIndexableBinaryStringTools.java similarity index 100% rename from src/test/java/org/elasticsearch/index/analysis/TestIndexableBinaryStringTools.java rename to plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/TestIndexableBinaryStringTools.java