Exposed ICU collator options in IcuCollationTokenFilterFactory

Closes #6
Martijn van Groningen 2012-09-28 09:58:57 +02:00
parent e7d045ed81
commit 59d7f5cc14
5 changed files with 416 additions and 8 deletions


@@ -103,6 +103,31 @@ And here is a sample of custom collation:
}
}
Optional settings:
* `strength` - The strength property determines the minimum level of difference considered significant during comparison.
The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`.
See the ICU Collation documentation (http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) for a more
detailed explanation of the specific values.
* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this property to `canonical` allows
the Collator to handle un-normalized text properly, producing the same results as if the text were normalized. If `no`
is set, it is the user's responsibility to ensure that all text is already in the appropriate form before a comparison
or before getting a CollationKey. Adjusting decomposition mode allows the user to select between faster and more
complete collation behavior. Since a great many of the world's languages do not require text normalization, most
locales set `no` as the default decomposition mode.
Expert settings:
* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`,
which boils down to whether punctuation and whitespace are ignored.
* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether case level sorting is required. When
strength is set to `primary`, this ignores accent differences but still takes case into account.
* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored
for strength `tertiary`.
* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to their numeric representation. For
example, the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`.
* `variableTop` - Single character or contraction. Controls which characters are treated as variable for the `alternate` setting.
* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Whether to distinguish between Katakana
and Hiragana characters at `quaternary` strength (see the combined settings example below).
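For illustration, here is a minimal sketch of index settings combining several of these options. The filter name
`my_collator` and the chosen values are just an example for this description, not something defined by the plugin:

    {
        "index" : {
            "analysis" : {
                "filter" : {
                    "my_collator" : {
                        "type" : "icu_collation",
                        "language" : "en",
                        "strength" : "primary",
                        "decomposition" : "canonical",
                        "alternate" : "shifted",
                        "variableTop" : " ",
                        "numeric" : true
                    }
                }
            }
        }
    }

Configured this way, the filter would ignore case and accent differences, treat whitespace (but not punctuation) as
ignorable, and sort embedded digits by their numeric value.
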
ICU Tokenizer
-------------

pom.xml

@@ -68,6 +68,16 @@
<artifactId>testng</artifactId>
<version>6.8</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-core</artifactId>
</exclusion>
<exclusion>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>


@@ -45,8 +45,6 @@ import java.io.IOException;
* <p>The second option is to specify collation rules as defined in the <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
* Collation customization</a> chapter in icu docs. The <tt>rules</tt> parameter can either embed the rules definition
* in the settings or refer to an external location (preferable located under the <tt>config</tt> location, relative to it).
*
*
*/
public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
@@ -96,6 +94,81 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
collator = Collator.getInstance();
}
}
// set the strength flag, otherwise it will be the default.
String strength = settings.get("strength");
if (strength != null) {
if (strength.equalsIgnoreCase("primary")) {
collator.setStrength(Collator.PRIMARY);
} else if (strength.equalsIgnoreCase("secondary")) {
collator.setStrength(Collator.SECONDARY);
} else if (strength.equalsIgnoreCase("tertiary")) {
collator.setStrength(Collator.TERTIARY);
} else if (strength.equalsIgnoreCase("quaternary")) {
collator.setStrength(Collator.QUATERNARY);
} else if (strength.equalsIgnoreCase("identical")) {
collator.setStrength(Collator.IDENTICAL);
} else {
throw new ElasticSearchIllegalArgumentException("Invalid strength: " + strength);
}
}
// set the decomposition flag, otherwise it will be the default.
String decomposition = settings.get("decomposition");
if (decomposition != null) {
if (decomposition.equalsIgnoreCase("no")) {
collator.setDecomposition(Collator.NO_DECOMPOSITION);
} else if (decomposition.equalsIgnoreCase("canonical")) {
collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
} else {
throw new ElasticSearchIllegalArgumentException("Invalid decomposition: " + decomposition);
}
}
// expert options: concrete subclasses are always a RuleBasedCollator
RuleBasedCollator rbc = (RuleBasedCollator) collator;
String alternate = settings.get("alternate");
if (alternate != null) {
if (alternate.equalsIgnoreCase("shifted")) {
rbc.setAlternateHandlingShifted(true);
} else if (alternate.equalsIgnoreCase("non-ignorable")) {
rbc.setAlternateHandlingShifted(false);
} else {
throw new ElasticSearchIllegalArgumentException("Invalid alternate: " + alternate);
}
}
Boolean caseLevel = settings.getAsBoolean("caseLevel", null);
if (caseLevel != null) {
rbc.setCaseLevel(caseLevel);
}
String caseFirst = settings.get("caseFirst");
if (caseFirst != null) {
if (caseFirst.equalsIgnoreCase("lower")) {
rbc.setLowerCaseFirst(true);
} else if (caseFirst.equalsIgnoreCase("upper")) {
rbc.setUpperCaseFirst(true);
} else {
throw new ElasticSearchIllegalArgumentException("Invalid caseFirst: " + caseFirst);
}
}
Boolean numeric = settings.getAsBoolean("numeric", null);
if (numeric != null) {
rbc.setNumericCollation(numeric);
}
String variableTop = settings.get("variableTop");
if (variableTop != null) {
rbc.setVariableTop(variableTop);
}
Boolean hiraganaQuaternaryMode = settings.getAsBoolean("hiraganaQuaternaryMode", null);
if (hiraganaQuaternaryMode != null) {
rbc.setHiraganaQuaternary(hiraganaQuaternaryMode);
}
this.collator = collator;
}


@@ -29,10 +29,10 @@ import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.hamcrest.MatcherAssert;
import org.testng.annotations.Test;
import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.instanceOf;
/**
@@ -53,18 +53,18 @@ public class SimpleIcuAnalysisTests {
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer");
MatcherAssert.assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));
assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));
TokenFilterFactory filterFactory = analysisService.tokenFilter("icu_normalizer");
MatcherAssert.assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));
assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));
filterFactory = analysisService.tokenFilter("icu_folding");
MatcherAssert.assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));
assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));
filterFactory = analysisService.tokenFilter("icu_collation");
MatcherAssert.assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));
assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));
filterFactory = analysisService.tokenFilter("icu_transform");
MatcherAssert.assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
}
}


@@ -0,0 +1,300 @@
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.testng.annotations.Test;
import java.io.IOException;
import java.io.StringReader;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
// Tests borrowed from Solr's Icu collation key filter factory test.
public class SimpleIcuCollationTokenFilterTests {
/*
* Turkish has some funny casing.
* This test shows how you can solve this kind of thing easily with collation.
* Instead of using LowerCaseFilter, use a turkish collator with primary strength.
* Then things will sort and match correctly.
*/
@Test
public void testBasicUsage() throws Exception {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "tr")
.put("index.analysis.filter.myCollator.strength", "primary")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String turkishUpperCase = "I WİLL USE TURKİSH CASING";
String turkishLowerCase = "ı will use turkish casıng";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase)));
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
/*
* Test usage of the decomposition option for unicode normalization.
*/
@Test
public void testNormalization() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "tr")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.decomposition", "canonical")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
String turkishLowerCase = "ı will use turkish casıng";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase)));
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
/*
* Test secondary strength; for English, case is not significant.
*/
@Test
public void testSecondaryStrength() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "secondary")
.put("index.analysis.filter.myCollator.decomposition", "no")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String upperCase = "TESTING";
String lowerCase = "testing";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upperCase)));
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
/*
* Setting alternate=shifted to shift whitespace, punctuation and symbols
* to quaternary level
*/
@Test
public void testIgnorePunctuation() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.alternate", "shifted")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String withPunctuation = "foo-bar";
String withoutPunctuation = "foo bar";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation)));
TokenStream tsWithoutPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withoutPunctuation)));
assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
}
/*
* Setting alternate=shifted and variableTop to shift whitespace, but not
* punctuation or symbols, to quaternary level
*/
@Test
public void testIgnoreWhitespace() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.alternate", "shifted")
.put("index.analysis.filter.myCollator.variableTop", " ")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String withSpace = "foo bar";
String withoutSpace = "foobar";
String withPunctuation = "foo-bar";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace)));
TokenStream tsWithoutSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withoutSpace)));
assertCollatesToSame(tsWithSpace, tsWithoutSpace);
// now assert that punctuation still matters: foo-bar < foo bar
tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace)));
TokenStream tsWithPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation)));
assertCollation(tsWithPunctuation, tsWithSpace, -1);
}
/*
* Setting numeric to encode digits with numeric value, so that
* foobar-9 sorts before foobar-10
*/
@Test
public void testNumerics() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.numeric", "true")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String nine = "foobar-9";
String ten = "foobar-10";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsNine = filterFactory.create(new KeywordTokenizer(new StringReader(nine)));
TokenStream tsTen = filterFactory.create(new KeywordTokenizer(new StringReader(ten)));
assertCollation(tsNine, tsTen, -1);
}
/*
* Setting caseLevel=true to create an additional case level between
* secondary and tertiary
*/
@Test
public void testIgnoreAccentsButNotCase() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.caseLevel", "true")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String withAccents = "résumé";
String withoutAccents = "resume";
String withAccentsUpperCase = "Résumé";
String withoutAccentsUpperCase = "Resume";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsWithAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withAccents)));
TokenStream tsWithoutAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents)));
assertCollatesToSame(tsWithAccents, tsWithoutAccents);
TokenStream tsWithAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
TokenStream tsWithoutAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);
// now assert that case still matters: resume < Resume
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents)));
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
assertCollation(tsLower, tsUpper, -1);
}
/*
* Setting caseFirst=upper to cause uppercase strings to sort
* before lowercase ones.
*/
@Test
public void testUpperCaseFirst() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "tertiary")
.put("index.analysis.filter.myCollator.caseFirst", "upper")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String lower = "resume";
String upper = "Resume";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lower)));
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upper)));
assertCollation(tsUpper, tsLower, -1);
}
/*
* For German, you might want oe to sort and match with o umlaut.
* This is not the default, but you can make a customized ruleset to do this.
*
* The default is DIN 5007-1; this shows how to tailor a collator to get DIN 5007-2 behavior.
* http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
*/
@Test
public void testCustomRules() throws Exception {
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
String DIN5007_2_tailorings =
"& ae , a\u0308 & AE , A\u0308"+
"& oe , o\u0308 & OE , O\u0308"+
"& ue , u\u0308 & UE , u\u0308";
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
String tailoredRules = tailoredCollator.getRules();
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.rules", tailoredRules)
.put("index.analysis.filter.myCollator.strength", "primary")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String germanUmlaut = "Töne";
String germanOE = "Toene";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsUmlaut = filterFactory.create(new KeywordTokenizer(new StringReader(germanUmlaut)));
TokenStream tsOE = filterFactory.create(new KeywordTokenizer(new StringReader(germanOE)));
assertCollatesToSame(tsUmlaut, tsOE);
}
private AnalysisService createAnalysisService(Index index, Settings settings) {
Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
Injector injector = new ModulesBuilder().add(
new IndexSettingsModule(index, settings),
new IndexNameModule(index),
new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor()))
.createChildInjector(parentInjector);
return injector.getInstance(AnalysisService.class);
}
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
assertCollation(stream1, stream2, 0);
}
private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
CharTermAttribute term1 = stream1
.addAttribute(CharTermAttribute.class);
CharTermAttribute term2 = stream2
.addAttribute(CharTermAttribute.class);
assertThat(stream1.incrementToken(), equalTo(true));
assertThat(stream2.incrementToken(), equalTo(true));
assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison)));
assertThat(stream1.incrementToken(), equalTo(false));
assertThat(stream2.incrementToken(), equalTo(false));
}
}