diff --git a/README.md b/README.md index 491053e4b9c..40aab5a20ce 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ Normalizes characters as explained [here](http://userguide.icu-project.org/trans "index" : { "analysis" : { "analyzer" : { - "collation" : { + "normalized" : { "tokenizer" : "keyword", "filter" : ["icu_normalizer"] } @@ -61,7 +61,7 @@ Folding of unicode characters based on `UTR#30`. It registers itself under `icu_ "index" : { "analysis" : { "analyzer" : { - "collation" : { + "folded" : { "tokenizer" : "keyword", "filter" : ["icu_folding"] } @@ -101,81 +101,6 @@ The Following example exempts Swedish characters from the folding. Note that the } ``` -ICU Collation -------------- - -Uses collation token filter. Allows to either specify the rules for collation -(defined [here](http://www.icu-project.org/userguide/Collate_Customization.html)) using the `rules` parameter -(can point to a location or expressed in the settings, location can be relative to config location), or using the -`language` parameter (further specialized by country and variant). By default registers under `icu_collation` or -`icuCollation` and uses the default locale. - -Here is a sample settings: - -```js -{ - "index" : { - "analysis" : { - "analyzer" : { - "collation" : { - "tokenizer" : "keyword", - "filter" : ["icu_collation"] - } - } - } - } -} -``` - -And here is a sample of custom collation: - -```js -{ - "index" : { - "analysis" : { - "analyzer" : { - "collation" : { - "tokenizer" : "keyword", - "filter" : ["myCollator"] - } - }, - "filter" : { - "myCollator" : { - "type" : "icu_collation", - "language" : "en" - } - } - } - } -} -``` - -Optional options: -* `strength` - The strength property determines the minimum level of difference considered significant during comparison. - The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator. - Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`. - See [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) documentation for a more detailed - explanation for the specific values. -* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property with -`canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were -normalized. If `no` is set, it is the user's responsibility to insure that all text is already in the appropriate form -before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between -faster and more complete collation behavior. Since a great many of the world's languages do not require text -normalization, most locales set `no` as the default decomposition mode. - -Expert options: -* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary` - to be either shifted or non-ignorable. What boils down to ignoring punctuation and whitespace. -* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether case level sorting is required. When - strength is set to `primary` this will ignore accent differences. -* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored - for strength `tertiary`. -* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to numeric representation. For - example the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`. -* `variableTop` - Single character or contraction. Controls what is variable for `alternate`. -* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Distinguishing between Katakana - and Hiragana characters in `quaternary` strength . - ICU Tokenizer ------------- @@ -186,7 +111,7 @@ Breaks text into words according to [UAX #29: Unicode Text Segmentation](http:// "index" : { "analysis" : { "analyzer" : { - "collation" : { + "tokenized" : { "tokenizer" : "icu_tokenizer", } } @@ -211,7 +136,7 @@ Here is a sample settings: "index" : { "analysis" : { "analyzer" : { - "collation" : { + "normalized" : { "tokenizer" : "keyword", "char_filter" : ["icu_normalizer"] } diff --git a/pom.xml b/pom.xml index 055c3882a58..938ddef943e 100644 --- a/pom.xml +++ b/pom.xml @@ -33,8 +33,8 @@ 2.0.0-SNAPSHOT - 4.10.2 - 4.10.2 + 5.0.0 + 5.0.0-snapshot-1636426 1 true onerror @@ -47,6 +47,10 @@ sonatype http://oss.sonatype.org/content/repositories/releases/ + + Lucene snapshots + https://download.elasticsearch.org/lucenesnapshots/maven/ + diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java index 0e2a9799daf..0e2bc7a13bb 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java @@ -23,7 +23,6 @@ import com.ibm.icu.text.Collator; import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.util.ULocale; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.collation.ICUCollationKeyFilter; import org.elasticsearch.ElasticsearchIllegalArgumentException; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; @@ -174,6 +173,7 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { @Override public TokenStream create(TokenStream tokenStream) { - return new ICUCollationKeyFilter(tokenStream, collator); + throw new UnsupportedOperationException("i was deprecated in lucene 4, and now i'm gone"); + // TODO: lucene does sort keys as binary keys since 4.x } } diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java index 168e85f8d2a..fe20d93069e 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java @@ -39,8 +39,8 @@ public class IcuTokenizerFactory extends AbstractTokenizerFactory { } @Override - public Tokenizer create(Reader reader) { - return new ICUTokenizer(reader); + public Tokenizer create() { + return new ICUTokenizer(); } } diff --git a/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java b/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java index eb0edf76185..a4330efd0d2 100644 --- a/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java +++ b/src/main/java/org/elasticsearch/indices/analysis/IcuIndicesAnalysis.java @@ -19,7 +19,6 @@ package org.elasticsearch.indices.analysis; -import com.ibm.icu.text.Collator; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.Transliterator; import org.apache.lucene.analysis.TokenStream; @@ -27,7 +26,6 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.icu.ICUFoldingFilter; import org.apache.lucene.analysis.icu.ICUTransformFilter; import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; -import org.apache.lucene.collation.ICUCollationKeyFilter; import org.elasticsearch.common.component.AbstractComponent; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.settings.Settings; @@ -36,8 +34,6 @@ import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.analysis.TokenizerFactory; -import java.io.Reader; - /** * Registers indices level analysis components so, if not explicitly configured, will be shared * among all indices. @@ -55,8 +51,8 @@ public class IcuIndicesAnalysis extends AbstractComponent { } @Override - public Tokenizer create(Reader reader) { - return new ICUTokenizer(reader); + public Tokenizer create() { + return new ICUTokenizer(); } })); @@ -85,18 +81,6 @@ public class IcuIndicesAnalysis extends AbstractComponent { } })); - indicesAnalysisService.tokenFilterFactories().put("icu_collation", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "icu_collation"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new ICUCollationKeyFilter(tokenStream, Collator.getInstance()); - } - })); - indicesAnalysisService.tokenFilterFactories().put("icu_transform", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { @Override public String name() { diff --git a/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java b/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java index 95874c98b07..d8a13b0febe 100644 --- a/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/ICUIntegrationTests.java @@ -52,10 +52,8 @@ public class ICUIntegrationTests extends ElasticsearchIntegrationTest { Settings settings = ImmutableSettings.builder() .put(super.indexSettings()) .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard") - .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_collator") - .put("index.analysis.filter.my_collator.type", "icu_collation") - .put("index.analysis.filter.my_collator.language", "en") - .put("index.analysis.filter.my_collator.strength", "primary") + .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "my_folding") + .put("index.analysis.filter.my_folding.type", "icu_folding") .build(); return settings; diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java deleted file mode 100644 index 5098a791f66..00000000000 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java +++ /dev/null @@ -1,303 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.elasticsearch.index.analysis; - -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.util.ULocale; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.elasticsearch.common.settings.ImmutableSettings; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.Index; -import org.elasticsearch.test.ElasticsearchTestCase; -import org.junit.Test; - -import java.io.IOException; -import java.io.StringReader; - -import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService; -import static org.hamcrest.Matchers.equalTo; - -// Tests borrowed from Solr's Icu collation key filter factory test. -public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase { - - /* - * Turkish has some funny casing. - * This test shows how you can solve this kind of thing easily with collation. - * Instead of using LowerCaseFilter, use a turkish collator with primary strength. - * Then things will sort and match correctly. - */ - @Test - public void testBasicUsage() throws Exception { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "tr") - .put("index.analysis.filter.myCollator.strength", "primary") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String turkishUpperCase = "I WİLL USE TURKİSH CASING"; - String turkishLowerCase = "ı will use turkish casıng"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase))); - TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase))); - assertCollatesToSame(tsUpper, tsLower); - } - - /* - * Test usage of the decomposition option for unicode normalization. - */ - @Test - public void testNormalization() throws IOException { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "tr") - .put("index.analysis.filter.myCollator.strength", "primary") - .put("index.analysis.filter.myCollator.decomposition", "canonical") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING"; - String turkishLowerCase = "ı will use turkish casıng"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase))); - TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase))); - assertCollatesToSame(tsUpper, tsLower); - } - - /* - * Test secondary strength, for english case is not significant. - */ - @Test - public void testSecondaryStrength() throws IOException { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "en") - .put("index.analysis.filter.myCollator.strength", "secondary") - .put("index.analysis.filter.myCollator.decomposition", "no") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String upperCase = "TESTING"; - String lowerCase = "testing"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upperCase))); - TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lowerCase))); - assertCollatesToSame(tsUpper, tsLower); - } - - /* - * Setting alternate=shifted to shift whitespace, punctuation and symbols - * to quaternary level - */ - @Test - public void testIgnorePunctuation() throws IOException { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "en") - .put("index.analysis.filter.myCollator.strength", "primary") - .put("index.analysis.filter.myCollator.alternate", "shifted") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String withPunctuation = "foo-bar"; - String withoutPunctuation = "foo bar"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation))); - TokenStream tsWithoutPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withoutPunctuation))); - assertCollatesToSame(tsPunctuation, tsWithoutPunctuation); - } - - /* - * Setting alternate=shifted and variableTop to shift whitespace, but not - * punctuation or symbols, to quaternary level - */ - @Test - public void testIgnoreWhitespace() throws IOException { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "en") - .put("index.analysis.filter.myCollator.strength", "primary") - .put("index.analysis.filter.myCollator.alternate", "shifted") - .put("index.analysis.filter.myCollator.variableTop", " ") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String withSpace = "foo bar"; - String withoutSpace = "foobar"; - String withPunctuation = "foo-bar"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace))); - TokenStream tsWithoutSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withoutSpace))); - assertCollatesToSame(tsWithSpace, tsWithoutSpace); - // now assert that punctuation still matters: foo-bar < foo bar - tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace))); - TokenStream tsWithPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation))); - assertCollation(tsWithPunctuation, tsWithSpace, -1); - } - - /* - * Setting numeric to encode digits with numeric value, so that - * foobar-9 sorts before foobar-10 - */ - @Test - public void testNumerics() throws IOException { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "en") - .put("index.analysis.filter.myCollator.numeric", "true") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String nine = "foobar-9"; - String ten = "foobar-10"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsNine = filterFactory.create(new KeywordTokenizer(new StringReader(nine))); - TokenStream tsTen = filterFactory.create(new KeywordTokenizer(new StringReader(ten))); - assertCollation(tsNine, tsTen, -1); - } - - /* - * Setting caseLevel=true to create an additional case level between - * secondary and tertiary - */ - @Test - public void testIgnoreAccentsButNotCase() throws IOException { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "en") - .put("index.analysis.filter.myCollator.strength", "primary") - .put("index.analysis.filter.myCollator.caseLevel", "true") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String withAccents = "résumé"; - String withoutAccents = "resume"; - String withAccentsUpperCase = "Résumé"; - String withoutAccentsUpperCase = "Resume"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsWithAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withAccents))); - TokenStream tsWithoutAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents))); - assertCollatesToSame(tsWithAccents, tsWithoutAccents); - - TokenStream tsWithAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withAccentsUpperCase))); - TokenStream tsWithoutAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase))); - assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase); - - // now assert that case still matters: resume < Resume - TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents))); - TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase))); - assertCollation(tsLower, tsUpper, -1); - } - - /* - * Setting caseFirst=upper to cause uppercase strings to sort - * before lowercase ones. - */ - @Test - public void testUpperCaseFirst() throws IOException { - Index index = new Index("test"); - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.language", "en") - .put("index.analysis.filter.myCollator.strength", "tertiary") - .put("index.analysis.filter.myCollator.caseFirst", "upper") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String lower = "resume"; - String upper = "Resume"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - - TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lower))); - TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upper))); - assertCollation(tsUpper, tsLower, -1); - } - - /* - * For german, you might want oe to sort and match with o umlaut. - * This is not the default, but you can make a customized ruleset to do this. - * - * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior. - * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383 - */ - @Test - public void testCustomRules() throws Exception { - RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE")); - String DIN5007_2_tailorings = - "& ae , a\u0308 & AE , A\u0308"+ - "& oe , o\u0308 & OE , O\u0308"+ - "& ue , u\u0308 & UE , u\u0308"; - - RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings); - String tailoredRules = tailoredCollator.getRules(); - - Settings settings = ImmutableSettings.settingsBuilder() - .put("index.analysis.filter.myCollator.type", "icu_collation") - .put("index.analysis.filter.myCollator.rules", tailoredRules) - .put("index.analysis.filter.myCollator.strength", "primary") - .build(); - AnalysisService analysisService = createAnalysisService(settings); - - String germanUmlaut = "Töne"; - String germanOE = "Toene"; - TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator"); - TokenStream tsUmlaut = filterFactory.create(new KeywordTokenizer(new StringReader(germanUmlaut))); - TokenStream tsOE = filterFactory.create(new KeywordTokenizer(new StringReader(germanOE))); - assertCollatesToSame(tsUmlaut, tsOE); - } - - private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException { - assertCollation(stream1, stream2, 0); - } - - private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException { - CharTermAttribute term1 = stream1 - .addAttribute(CharTermAttribute.class); - CharTermAttribute term2 = stream2 - .addAttribute(CharTermAttribute.class); - - stream1.reset(); - stream2.reset(); - - assertThat(stream1.incrementToken(), equalTo(true)); - assertThat(stream2.incrementToken(), equalTo(true)); - assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison))); - assertThat(stream1.incrementToken(), equalTo(false)); - assertThat(stream2.incrementToken(), equalTo(false)); - } - -}