From 59d7f5cc14dd68166c512098e5ee1cea490191c9 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen
Date: Fri, 28 Sep 2012 09:58:57 +0200
Subject: [PATCH] Exposed ICU collator options in IcuCollationTokenFilterFactory

Closes #6
---
 README.md                                    |  25 ++
 pom.xml                                      |  10 +
 .../IcuCollationTokenFilterFactory.java      |  77 ++++-
 .../analysis/SimpleIcuAnalysisTests.java     |  12 +-
 .../SimpleIcuCollationTokenFilterTests.java  | 300 ++++++++++++++++++
 5 files changed, 416 insertions(+), 8 deletions(-)
 create mode 100644 src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java

diff --git a/README.md b/README.md
index de8dd59d8f7..f29c1d5f00f 100644
--- a/README.md
+++ b/README.md
@@ -103,6 +103,31 @@ And here is a sample of custom collation:
     }
 }
 
+Options:
+* `strength` - The strength property determines the minimum level of difference considered significant during comparison.
+ The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
+ Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`.
+ See the [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) documentation for a more
+ detailed explanation of each value.
+* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property to
+`canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were
+normalized. If `no` is set, it is the user's responsibility to ensure that all text is already in the appropriate form
+before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between
+faster and more complete collation behavior. Since a great many of the world's languages do not require text
+normalization, most locales set `no` as the default decomposition mode.
+
+Expert options:
+* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`
+ to be either shifted or non-ignorable, which boils down to whether punctuation and whitespace are ignored.
+* `caseLevel` - Possible values: `true` or `false`. Defaults to `false`. Whether case-level sorting is required. When
+ strength is set to `primary`, this ignores accent differences but still takes case into account.
+* `caseFirst` - Possible values: `lower` or `upper`. Controls which case is sorted first when case is not ignored
+ for strength `tertiary`.
+* `numeric` - Possible values: `true` or `false`. Defaults to `false`. Whether digits are sorted according to their
+ numeric representation. For example, the value `egg-9` is then sorted before the value `egg-21`.
+* `variableTop` - A single character or contraction. Controls which characters are affected by `alternate`.
+* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Distinguishes between Katakana
+ and Hiragana characters at `quaternary` strength.
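+
+For example, several of the options above can be combined in a single `icu_collation` filter. The analyzer and filter
+names in the following sketch are only illustrative:
+
+    {
+        "index" : {
+            "analysis" : {
+                "analyzer" : {
+                    "collation" : {
+                        "tokenizer" : "keyword",
+                        "filter" : ["myCollator"]
+                    }
+                },
+                "filter" : {
+                    "myCollator" : {
+                        "type" : "icu_collation",
+                        "language" : "en",
+                        "strength" : "primary",
+                        "alternate" : "shifted",
+                        "variableTop" : " ",
+                        "numeric" : true
+                    }
+                }
+            }
+        }
+    }
+
+With settings like these, whitespace is ignored while punctuation still matters, and `foobar-9` sorts before `foobar-10`.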
 ICU Tokenizer
 -------------

diff --git a/pom.xml b/pom.xml
index 6265c581d1f..f7881eaaf03 100644
--- a/pom.xml
+++ b/pom.xml
@@ -68,6 +68,16 @@
             testng
             6.8
             test
+
+            org.hamcrest
+            hamcrest-core
+
+            junit
+            junit
+
+
 
diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java
index 6c98d767022..d756e978b64 100644
--- a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java
@@ -45,8 +45,6 @@ import java.io.IOException;
 *
 * The second option is to specify collation rules as defined in the
 * Collation customization chapter in icu docs. The rules parameter can either embed the rules definition
 * in the settings or refer to an external location (preferable located under the config location, relative to it).
- *
- *
 */
 public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
@@ -96,6 +94,81 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
             collator = Collator.getInstance();
         }
     }
+
+        // set the strength flag, otherwise it will be the default.
+        String strength = settings.get("strength");
+        if (strength != null) {
+            if (strength.equalsIgnoreCase("primary")) {
+                collator.setStrength(Collator.PRIMARY);
+            } else if (strength.equalsIgnoreCase("secondary")) {
+                collator.setStrength(Collator.SECONDARY);
+            } else if (strength.equalsIgnoreCase("tertiary")) {
+                collator.setStrength(Collator.TERTIARY);
+            } else if (strength.equalsIgnoreCase("quaternary")) {
+                collator.setStrength(Collator.QUATERNARY);
+            } else if (strength.equalsIgnoreCase("identical")) {
+                collator.setStrength(Collator.IDENTICAL);
+            } else {
+                throw new ElasticSearchIllegalArgumentException("Invalid strength: " + strength);
+            }
+        }
+
+        // set the decomposition flag, otherwise it will be the default.
+        String decomposition = settings.get("decomposition");
+        if (decomposition != null) {
+            if (decomposition.equalsIgnoreCase("no")) {
+                collator.setDecomposition(Collator.NO_DECOMPOSITION);
+            } else if (decomposition.equalsIgnoreCase("canonical")) {
+                collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
+            } else {
+                throw new ElasticSearchIllegalArgumentException("Invalid decomposition: " + decomposition);
+            }
+        }
+
+        // expert options: concrete subclasses are always a RuleBasedCollator
+        RuleBasedCollator rbc = (RuleBasedCollator) collator;
+        String alternate = settings.get("alternate");
+        if (alternate != null) {
+            if (alternate.equalsIgnoreCase("shifted")) {
+                rbc.setAlternateHandlingShifted(true);
+            } else if (alternate.equalsIgnoreCase("non-ignorable")) {
+                rbc.setAlternateHandlingShifted(false);
+            } else {
+                throw new ElasticSearchIllegalArgumentException("Invalid alternate: " + alternate);
+            }
+        }
+
+        Boolean caseLevel = settings.getAsBoolean("caseLevel", null);
+        if (caseLevel != null) {
+            rbc.setCaseLevel(caseLevel);
+        }
+
+        String caseFirst = settings.get("caseFirst");
+        if (caseFirst != null) {
+            if (caseFirst.equalsIgnoreCase("lower")) {
+                rbc.setLowerCaseFirst(true);
+            } else if (caseFirst.equalsIgnoreCase("upper")) {
+                rbc.setUpperCaseFirst(true);
+            } else {
+                throw new ElasticSearchIllegalArgumentException("Invalid caseFirst: " + caseFirst);
+            }
+        }
+
+        Boolean numeric = settings.getAsBoolean("numeric", null);
+        if (numeric != null) {
+            rbc.setNumericCollation(numeric);
+        }
+
+        String variableTop = settings.get("variableTop");
+        if (variableTop != null) {
+            rbc.setVariableTop(variableTop);
+        }
+
+        Boolean hiraganaQuaternaryMode = settings.getAsBoolean("hiraganaQuaternaryMode", null);
+        if (hiraganaQuaternaryMode != null) {
+            rbc.setHiraganaQuaternary(hiraganaQuaternaryMode);
+        }
+
         this.collator = collator;
     }
diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java
index a3e55a7a10e..43df9d270be 100644
--- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java
+++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java
@@ -29,10 +29,10 @@
 import org.elasticsearch.index.IndexNameModule;
 import org.elasticsearch.index.settings.IndexSettingsModule;
 import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
 import org.elasticsearch.indices.analysis.IndicesAnalysisService;
-import org.hamcrest.MatcherAssert;
 import org.testng.annotations.Test;
 
 import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS;
+import static org.hamcrest.MatcherAssert.assertThat;
 import static org.hamcrest.Matchers.instanceOf;
 /**
@@ -53,18 +53,18 @@ public class SimpleIcuAnalysisTests {
         AnalysisService analysisService = injector.getInstance(AnalysisService.class);
 
         TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer");
-        MatcherAssert.assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));
+        assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));
 
         TokenFilterFactory filterFactory = analysisService.tokenFilter("icu_normalizer");
-        MatcherAssert.assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));
+        assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));
 
         filterFactory = analysisService.tokenFilter("icu_folding");
-        MatcherAssert.assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));
+        assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));
 
         filterFactory = analysisService.tokenFilter("icu_collation");
-        MatcherAssert.assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));
+        assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));
 
         filterFactory = analysisService.tokenFilter("icu_transform");
-        MatcherAssert.assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
+        assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
     }
 }
diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java
new file mode 100644
index 00000000000..5d0b60b9575
--- /dev/null
+++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java
@@ -0,0 +1,300 @@
+package org.elasticsearch.index.analysis;
+
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.util.ULocale;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.elasticsearch.common.inject.Injector;
+import org.elasticsearch.common.inject.ModulesBuilder;
+import org.elasticsearch.common.settings.ImmutableSettings;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.settings.SettingsModule;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.EnvironmentModule;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.IndexNameModule;
+import org.elasticsearch.index.settings.IndexSettingsModule;
+import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
+import org.elasticsearch.indices.analysis.IndicesAnalysisService;
+import org.testng.annotations.Test;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.equalTo;
+
+// Tests borrowed from Solr's Icu collation key filter factory test.
+public class SimpleIcuCollationTokenFilterTests {
+
+    /*
+     * Turkish has some funny casing.
+     * This test shows how you can solve this kind of thing easily with collation.
+     * Instead of using LowerCaseFilter, use a Turkish collator with primary strength.
+     * Then things will sort and match correctly.
+     */
+    @Test
+    public void testBasicUsage() throws Exception {
+        Index index = new Index("test");
+        Settings settings = ImmutableSettings.settingsBuilder()
+                .put("index.analysis.filter.myCollator.type", "icu_collation")
+                .put("index.analysis.filter.myCollator.language", "tr")
+                .put("index.analysis.filter.myCollator.strength", "primary")
+                .build();
+        AnalysisService analysisService = createAnalysisService(index, settings);
+
+        String turkishUpperCase = "I WİLL USE TURKİSH CASING";
+        String turkishLowerCase = "ı will use turkish casıng";
+        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
+
+        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase)));
+        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase)));
+        assertCollatesToSame(tsUpper, tsLower);
+    }
+
+    /*
+     * Test usage of the decomposition option for Unicode normalization.
+     */
+    @Test
+    public void testNormalization() throws IOException {
+        Index index = new Index("test");
+        Settings settings = ImmutableSettings.settingsBuilder()
+                .put("index.analysis.filter.myCollator.type", "icu_collation")
+                .put("index.analysis.filter.myCollator.language", "tr")
+                .put("index.analysis.filter.myCollator.strength", "primary")
+                .put("index.analysis.filter.myCollator.decomposition", "canonical")
+                .build();
+        AnalysisService analysisService = createAnalysisService(index, settings);
+
+        String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
+        String turkishLowerCase = "ı will use turkish casıng";
+        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
+
+        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase)));
+        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase)));
+        assertCollatesToSame(tsUpper, tsLower);
+    }
+
+    /*
+     * Test secondary strength; for English, case is not significant.
+     */
+    @Test
+    public void testSecondaryStrength() throws IOException {
+        Index index = new Index("test");
+        Settings settings = ImmutableSettings.settingsBuilder()
+                .put("index.analysis.filter.myCollator.type", "icu_collation")
+                .put("index.analysis.filter.myCollator.language", "en")
+                .put("index.analysis.filter.myCollator.strength", "secondary")
+                .put("index.analysis.filter.myCollator.decomposition", "no")
+                .build();
+        AnalysisService analysisService = createAnalysisService(index, settings);
+
+        String upperCase = "TESTING";
+        String lowerCase = "testing";
+        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
+
+        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upperCase)));
+        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lowerCase)));
+        assertCollatesToSame(tsUpper, tsLower);
+    }
+
+    /*
+     * Setting alternate=shifted to shift whitespace, punctuation and symbols
+     * to quaternary level
+     */
+    @Test
+    public void testIgnorePunctuation() throws IOException {
+        Index index = new Index("test");
+        Settings settings = ImmutableSettings.settingsBuilder()
+                .put("index.analysis.filter.myCollator.type", "icu_collation")
+                .put("index.analysis.filter.myCollator.language", "en")
+                .put("index.analysis.filter.myCollator.strength", "primary")
+                .put("index.analysis.filter.myCollator.alternate", "shifted")
+                .build();
+        AnalysisService analysisService = createAnalysisService(index, settings);
+
+        String withPunctuation = "foo-bar";
+        String withoutPunctuation = "foo bar";
+        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
+
+        TokenStream tsPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation)));
+        TokenStream tsWithoutPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withoutPunctuation)));
+        assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
+    }
+
+    /*
+     * Setting alternate=shifted and variableTop to shift whitespace, but not
+     * punctuation or symbols, to quaternary level
+     */
+    @Test
+    public void testIgnoreWhitespace() throws IOException {
+        Index index = new Index("test");
+        Settings settings = ImmutableSettings.settingsBuilder()
+                .put("index.analysis.filter.myCollator.type", "icu_collation")
+                .put("index.analysis.filter.myCollator.language", "en")
+                .put("index.analysis.filter.myCollator.strength", "primary")
+                .put("index.analysis.filter.myCollator.alternate", "shifted")
+                .put("index.analysis.filter.myCollator.variableTop", " ")
+                .build();
+        AnalysisService analysisService = createAnalysisService(index, settings);
+
+        String withSpace = "foo bar";
+        String withoutSpace = "foobar";
+        String withPunctuation = "foo-bar";
+        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
+
+        TokenStream tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace)));
+        TokenStream tsWithoutSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withoutSpace)));
+        assertCollatesToSame(tsWithSpace, tsWithoutSpace);
+        // now assert that punctuation still matters: foo-bar < foo bar
+        tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace)));
+        TokenStream tsWithPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation)));
+        assertCollation(tsWithPunctuation, tsWithSpace, -1);
+    }
+
+    /*
+     * Setting numeric to encode digits with numeric value, so that
+     * foobar-9 sorts before foobar-10
+     */
+    @Test
+    public void testNumerics() throws IOException {
+        Index index = new Index("test");
+        Settings settings = ImmutableSettings.settingsBuilder()
+                .put("index.analysis.filter.myCollator.type", "icu_collation")
+                .put("index.analysis.filter.myCollator.language", "en")
+                .put("index.analysis.filter.myCollator.numeric", "true")
+                .build();
+        AnalysisService analysisService = createAnalysisService(index, settings);
+
+        String nine = "foobar-9";
+        String ten = "foobar-10";
+        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
+
+        TokenStream tsNine = filterFactory.create(new KeywordTokenizer(new StringReader(nine)));
+        TokenStream tsTen = filterFactory.create(new KeywordTokenizer(new StringReader(ten)));
+        assertCollation(tsNine, tsTen, -1);
+    }
+
+    /*
+     * Setting caseLevel=true to create an additional case level between
+     * secondary and tertiary
+     */
+    @Test
+    public void testIgnoreAccentsButNotCase() throws IOException {
+        Index index = new Index("test");
+        Settings settings = ImmutableSettings.settingsBuilder()
+                .put("index.analysis.filter.myCollator.type", "icu_collation")
+                .put("index.analysis.filter.myCollator.language", "en")
+                .put("index.analysis.filter.myCollator.strength", "primary")
+                .put("index.analysis.filter.myCollator.caseLevel", "true")
+                .build();
+        AnalysisService analysisService = createAnalysisService(index, settings);
+
+        String withAccents = "résumé";
+        String withoutAccents = "resume";
+        String withAccentsUpperCase = "Résumé";
+        String withoutAccentsUpperCase = "Resume";
+        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
+
+        TokenStream tsWithAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withAccents)));
+        TokenStream tsWithoutAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents)));
+        assertCollatesToSame(tsWithAccents, tsWithoutAccents);
+
+        TokenStream tsWithAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
+        TokenStream tsWithoutAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
+        assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);
+
+        // now assert that case still matters: resume < Resume
+        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents)));
+        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
+        assertCollation(tsLower, tsUpper, -1);
+    }
+
+    /*
+     * Setting caseFirst=upper to cause uppercase strings to sort
+     * before lowercase ones.
+     */
+    @Test
+    public void testUpperCaseFirst() throws IOException {
+        Index index = new Index("test");
+        Settings settings = ImmutableSettings.settingsBuilder()
+                .put("index.analysis.filter.myCollator.type", "icu_collation")
+                .put("index.analysis.filter.myCollator.language", "en")
+                .put("index.analysis.filter.myCollator.strength", "tertiary")
+                .put("index.analysis.filter.myCollator.caseFirst", "upper")
+                .build();
+        AnalysisService analysisService = createAnalysisService(index, settings);
+
+        String lower = "resume";
+        String upper = "Resume";
+        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
+
+        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lower)));
+        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upper)));
+        assertCollation(tsUpper, tsLower, -1);
+    }
+
+    /*
+     * For German, you might want oe to sort and match with o umlaut.
+     * This is not the default, but you can make a customized ruleset to do this.
+     *
+     * The default is DIN 5007-1; this shows how to tailor a collator to get DIN 5007-2 behavior.
+     * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
+     */
+    @Test
+    public void testCustomRules() throws Exception {
+        RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
+        String DIN5007_2_tailorings =
+                "& ae , a\u0308 & AE , A\u0308"+
+                "& oe , o\u0308 & OE , O\u0308"+
+                "& ue , u\u0308 & UE , u\u0308";
+
+        RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
+        String tailoredRules = tailoredCollator.getRules();
+
+        Index index = new Index("test");
+        Settings settings = ImmutableSettings.settingsBuilder()
+                .put("index.analysis.filter.myCollator.type", "icu_collation")
+                .put("index.analysis.filter.myCollator.rules", tailoredRules)
+                .put("index.analysis.filter.myCollator.strength", "primary")
+                .build();
+        AnalysisService analysisService = createAnalysisService(index, settings);
+
+        String germanUmlaut = "Töne";
+        String germanOE = "Toene";
+        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
+        TokenStream tsUmlaut = filterFactory.create(new KeywordTokenizer(new StringReader(germanUmlaut)));
+        TokenStream tsOE = filterFactory.create(new KeywordTokenizer(new StringReader(germanOE)));
+        assertCollatesToSame(tsUmlaut, tsOE);
+    }
+
+    private AnalysisService createAnalysisService(Index index, Settings settings) {
+        Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
+        Injector injector = new ModulesBuilder().add(
+                new IndexSettingsModule(index, settings),
+                new IndexNameModule(index),
+                new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor()))
+                .createChildInjector(parentInjector);
+
+        return injector.getInstance(AnalysisService.class);
+    }
+
+    private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
+        assertCollation(stream1, stream2, 0);
+    }
+
+    private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
+        CharTermAttribute term1 = stream1
+                .addAttribute(CharTermAttribute.class);
+        CharTermAttribute term2 = stream2
+                .addAttribute(CharTermAttribute.class);
+        assertThat(stream1.incrementToken(), equalTo(true));
+        assertThat(stream2.incrementToken(), equalTo(true));
+        assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison)));
+        assertThat(stream1.incrementToken(), equalTo(false));
+        assertThat(stream2.incrementToken(), equalTo(false));
+    }
+
+}