Exposed ICU collator options in IcuCollationTokenFilterFactory

Closes #6
2012-09-28 09:58:57 +02:00 · 2012-09-28 09:58:57 +02:00 · 59d7f5cc14
parent e7d045ed81
commit 59d7f5cc14
5 changed files with 416 additions and 8 deletions
--- a/README.md
+++ b/README.md
@ -103,6 +103,31 @@ And here is a sample of custom collation:
        }
    }
 Optional options:
 * `strength` - The strength property determines the minimum level of difference considered significant during comparison.
 The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
 Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`.
 See the [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) documentation for a more detailed
 explanation for the specific values.
 * `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property with
 `canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were
 normalized. If `no` is set, it is the user's responsibility to ensure that all text is already in the appropriate form
 before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between
 faster and more complete collation behavior. Since a great many of the world's languages do not require text
 normalization, most locales set `no` as the default decomposition mode.
 Expert options:
 * `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`
 to be either shifted or non-ignorable. This boils down to whether punctuation and whitespace are ignored.
 * `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether case level sorting is required. When
 strength is set to `primary` this will ignore accent differences.
 * `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored
 for strength `tertiary`.
 * `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to numeric representation. For
 example the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`.
 * `variableTop` - Single character or contraction. Controls what is variable for `alternate`.
 * `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Whether to distinguish between
 Katakana and Hiragana characters at `quaternary` strength.
 ICU Tokenizer
 -------------
--- a/pom.xml
+++ b/pom.xml
@ -68,6 +68,16 @@
            <artifactId>testng</artifactId>
            <version>6.8</version>
            <scope>test</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.hamcrest</groupId>
                    <artifactId>hamcrest-core</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>junit</groupId>
                    <artifactId>junit</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
--- a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java
@ -45,8 +45,6 @@ import java.io.IOException;
 * <p>The second option is to specify collation rules as defined in the <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
 * Collation customization</a> chapter in icu docs. The <tt>rules</tt> parameter can either embed the rules definition
 * in the settings or refer to an external location (preferably located under the <tt>config</tt> location, relative to it).
 *
 *
 */
 public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
@ -96,6 +94,81 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
                collator = Collator.getInstance();
            }
        }
        // set the strength flag, otherwise it will be the default.
        String strength = settings.get("strength");
        if (strength != null) {
            if (strength.equalsIgnoreCase("primary")) {
                collator.setStrength(Collator.PRIMARY);
            } else if (strength.equalsIgnoreCase("secondary")) {
                collator.setStrength(Collator.SECONDARY);
            } else if (strength.equalsIgnoreCase("tertiary")) {
                collator.setStrength(Collator.TERTIARY);
            } else if (strength.equalsIgnoreCase("quaternary")) {
                collator.setStrength(Collator.QUATERNARY);
            } else if (strength.equalsIgnoreCase("identical")) {
                collator.setStrength(Collator.IDENTICAL);
            } else {
                throw new ElasticSearchIllegalArgumentException("Invalid strength: " + strength);
            }
        }
        // set the decomposition flag, otherwise it will be the default.
        String decomposition = settings.get("decomposition");
        if (decomposition != null) {
            if (decomposition.equalsIgnoreCase("no")) {
                collator.setDecomposition(Collator.NO_DECOMPOSITION);
            } else if (decomposition.equalsIgnoreCase("canonical")) {
                collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
            } else {
                throw new ElasticSearchIllegalArgumentException("Invalid decomposition: " + decomposition);
            }
        }
        // expert options: concrete subclasses are always a RuleBasedCollator
        RuleBasedCollator rbc = (RuleBasedCollator) collator;
        String alternate = settings.get("alternate");
        if (alternate != null) {
            if (alternate.equalsIgnoreCase("shifted")) {
                rbc.setAlternateHandlingShifted(true);
            } else if (alternate.equalsIgnoreCase("non-ignorable")) {
                rbc.setAlternateHandlingShifted(false);
            } else {
                throw new ElasticSearchIllegalArgumentException("Invalid alternate: " + alternate);
            }
        }
        Boolean caseLevel = settings.getAsBoolean("caseLevel", null);
        if (caseLevel != null) {
            rbc.setCaseLevel(caseLevel);
        }
        String caseFirst = settings.get("caseFirst");
        if (caseFirst != null) {
            if (caseFirst.equalsIgnoreCase("lower")) {
                rbc.setLowerCaseFirst(true);
            } else if (caseFirst.equalsIgnoreCase("upper")) {
                rbc.setUpperCaseFirst(true);
            } else {
                throw new ElasticSearchIllegalArgumentException("Invalid caseFirst: " + caseFirst);
            }
        }
        Boolean numeric = settings.getAsBoolean("numeric", null);
        if (numeric != null) {
            rbc.setNumericCollation(numeric);
        }
        String variableTop = settings.get("variableTop");
        if (variableTop != null) {
            rbc.setVariableTop(variableTop);
        }
        Boolean hiraganaQuaternaryMode = settings.getAsBoolean("hiraganaQuaternaryMode", null);
        if (hiraganaQuaternaryMode != null) {
            rbc.setHiraganaQuaternary(hiraganaQuaternaryMode);
        }
        this.collator = collator;
    }
--- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java
+++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java
@ -29,10 +29,10 @@ import org.elasticsearch.index.IndexNameModule;
 import org.elasticsearch.index.settings.IndexSettingsModule;
 import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
 import org.elasticsearch.indices.analysis.IndicesAnalysisService;
 import org.hamcrest.MatcherAssert;
 import org.testng.annotations.Test;
 import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS;
 import static org.hamcrest.MatcherAssert.assertThat;
 import static org.hamcrest.Matchers.instanceOf;
 /**
@ -53,18 +53,18 @@ public class SimpleIcuAnalysisTests {
        AnalysisService analysisService = injector.getInstance(AnalysisService.class);
        TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer");
-        MatcherAssert.assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));
+        assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));
        TokenFilterFactory filterFactory = analysisService.tokenFilter("icu_normalizer");
-        MatcherAssert.assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));
+        assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));
        filterFactory = analysisService.tokenFilter("icu_folding");
-        MatcherAssert.assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));
+        assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));
        filterFactory = analysisService.tokenFilter("icu_collation");
-        MatcherAssert.assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));
+        assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));
        filterFactory = analysisService.tokenFilter("icu_transform");
-        MatcherAssert.assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
+        assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
    }
 }
--- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java
+++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuCollationTokenFilterTests.java
@ -0,0 +1,300 @@
 package org.elasticsearch.index.analysis;
 import com.ibm.icu.text.Collator;
 import com.ibm.icu.text.RuleBasedCollator;
 import com.ibm.icu.util.ULocale;
 import org.apache.lucene.analysis.KeywordTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.elasticsearch.common.inject.Injector;
 import org.elasticsearch.common.inject.ModulesBuilder;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.settings.SettingsModule;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.env.EnvironmentModule;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexNameModule;
 import org.elasticsearch.index.settings.IndexSettingsModule;
 import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
 import org.elasticsearch.indices.analysis.IndicesAnalysisService;
 import org.testng.annotations.Test;
 import java.io.IOException;
 import java.io.StringReader;
 import static org.hamcrest.MatcherAssert.assertThat;
 import static org.hamcrest.Matchers.equalTo;
 // Tests borrowed from Solr's Icu collation key filter factory test.
 public class SimpleIcuCollationTokenFilterTests {
    /*
    * Turkish casing is unusual (dotted and dotless i).
    * Rather than lower-casing with LowerCaseFilter, this test runs both spellings
    * through a Turkish collator configured with primary strength and expects
    * the upper-case and lower-case inputs to collate to the same key.
    */
    @Test
    public void testBasicUsage() throws Exception {
        Index testIndex = new Index("test");
        Settings collatorSettings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "tr")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .build();
        TokenFilterFactory collationFactory =
                createAnalysisService(testIndex, collatorSettings).tokenFilter("myCollator");

        // Each input is fed through a KeywordTokenizer so it is handled as a single token.
        KeywordTokenizer upperSource = new KeywordTokenizer(new StringReader("I WİLL USE TURKİSH CASING"));
        KeywordTokenizer lowerSource = new KeywordTokenizer(new StringReader("ı will use turkish casıng"));

        assertCollatesToSame(collationFactory.create(upperSource), collationFactory.create(lowerSource));
    }
    /*
    * Exercises the decomposition option (unicode normalization): the upper-case
    * input spells one of its dotted capital I characters as two code points,
    * and with decomposition=canonical it must still collate like the lower-case input.
    */
    @Test
    public void testNormalization() throws IOException {
        Index testIndex = new Index("test");
        Settings collatorSettings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "tr")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.decomposition", "canonical")
                .build();
        TokenFilterFactory collationFactory =
                createAnalysisService(testIndex, collatorSettings).tokenFilter("myCollator");

        StringReader decomposedUpper = new StringReader("I W\u0049\u0307LL USE TURKİSH CASING");
        StringReader lower = new StringReader("ı will use turkish casıng");

        TokenStream upperKeys = collationFactory.create(new KeywordTokenizer(decomposedUpper));
        TokenStream lowerKeys = collationFactory.create(new KeywordTokenizer(lower));
        assertCollatesToSame(upperKeys, lowerKeys);
    }
    /*
    * With secondary strength, case is not significant for English:
    * an all-upper-case word and its lower-case form must collate to the same key.
    */
    @Test
    public void testSecondaryStrength() throws IOException {
        Index testIndex = new Index("test");
        Settings collatorSettings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "secondary")
                .put("index.analysis.filter.myCollator.decomposition", "no")
                .build();
        AnalysisService service = createAnalysisService(testIndex, collatorSettings);
        TokenFilterFactory collationFactory = service.tokenFilter("myCollator");

        TokenStream fromUpper = collationFactory.create(new KeywordTokenizer(new StringReader("TESTING")));
        TokenStream fromLower = collationFactory.create(new KeywordTokenizer(new StringReader("testing")));

        assertCollatesToSame(fromUpper, fromLower);
    }
    /*
    * alternate=shifted moves whitespace, punctuation and symbols to the
    * quaternary level, so at primary strength "foo-bar" and "foo bar"
    * are expected to collate to the same key.
    */
    @Test
    public void testIgnorePunctuation() throws IOException {
        Index testIndex = new Index("test");
        Settings collatorSettings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.alternate", "shifted")
                .build();
        TokenFilterFactory collationFactory =
                createAnalysisService(testIndex, collatorSettings).tokenFilter("myCollator");

        KeywordTokenizer hyphenated = new KeywordTokenizer(new StringReader("foo-bar"));
        KeywordTokenizer spaced = new KeywordTokenizer(new StringReader("foo bar"));

        assertCollatesToSame(collationFactory.create(hyphenated), collationFactory.create(spaced));
    }
    /*
    * alternate=shifted combined with variableTop=" " shifts whitespace,
    * but not punctuation or symbols, to the quaternary level.
    */
    @Test
    public void testIgnoreWhitespace() throws IOException {
        Index testIndex = new Index("test");
        Settings collatorSettings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.alternate", "shifted")
                .put("index.analysis.filter.myCollator.variableTop", " ")
                .build();
        TokenFilterFactory collationFactory =
                createAnalysisService(testIndex, collatorSettings).tokenFilter("myCollator");

        final String spaced = "foo bar";
        final String joined = "foobar";
        final String hyphenated = "foo-bar";

        // whitespace is ignored: "foo bar" and "foobar" collate to the same key
        TokenStream spacedKeys = collationFactory.create(new KeywordTokenizer(new StringReader(spaced)));
        TokenStream joinedKeys = collationFactory.create(new KeywordTokenizer(new StringReader(joined)));
        assertCollatesToSame(spacedKeys, joinedKeys);

        // punctuation still matters: foo-bar < foo bar (fresh streams are created for this comparison)
        TokenStream spacedAgain = collationFactory.create(new KeywordTokenizer(new StringReader(spaced)));
        TokenStream hyphenatedKeys = collationFactory.create(new KeywordTokenizer(new StringReader(hyphenated)));
        assertCollation(hyphenatedKeys, spacedAgain, -1);
    }
    /*
    * numeric=true encodes digits by their numeric value, so that
    * foobar-9 sorts before foobar-10.
    */
    @Test
    public void testNumerics() throws IOException {
        Index testIndex = new Index("test");
        Settings collatorSettings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.numeric", "true")
                .build();
        AnalysisService service = createAnalysisService(testIndex, collatorSettings);
        TokenFilterFactory collationFactory = service.tokenFilter("myCollator");

        TokenStream nineKeys = collationFactory.create(new KeywordTokenizer(new StringReader("foobar-9")));
        TokenStream tenKeys = collationFactory.create(new KeywordTokenizer(new StringReader("foobar-10")));

        // the key for "-9" must compare lower than the key for "-10"
        assertCollation(nineKeys, tenKeys, -1);
    }
    /*
    * caseLevel=true adds an extra case level between secondary and tertiary.
    * Combined with primary strength, accents are expected to be ignored
    * while case differences are still significant.
    */
    @Test
    public void testIgnoreAccentsButNotCase() throws IOException {
        Index testIndex = new Index("test");
        Settings collatorSettings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.caseLevel", "true")
                .build();
        TokenFilterFactory collationFactory =
                createAnalysisService(testIndex, collatorSettings).tokenFilter("myCollator");

        final String accentedLower = "résumé";
        final String plainLower = "resume";
        final String accentedUpper = "Résumé";
        final String plainUpper = "Resume";

        // accents are ignored for the lower-case pair ...
        TokenStream accentedLowerKeys = collationFactory.create(new KeywordTokenizer(new StringReader(accentedLower)));
        TokenStream plainLowerKeys = collationFactory.create(new KeywordTokenizer(new StringReader(plainLower)));
        assertCollatesToSame(accentedLowerKeys, plainLowerKeys);

        // ... and for the capitalized pair
        TokenStream accentedUpperKeys = collationFactory.create(new KeywordTokenizer(new StringReader(accentedUpper)));
        TokenStream plainUpperKeys = collationFactory.create(new KeywordTokenizer(new StringReader(plainUpper)));
        assertCollatesToSame(accentedUpperKeys, plainUpperKeys);

        // case still matters: resume < Resume
        TokenStream lowerKeys = collationFactory.create(new KeywordTokenizer(new StringReader(plainLower)));
        TokenStream upperKeys = collationFactory.create(new KeywordTokenizer(new StringReader(plainUpper)));
        assertCollation(lowerKeys, upperKeys, -1);
    }
    /*
    * caseFirst=upper makes upper-case strings sort
    * before lower-case ones (checked here at tertiary strength).
    */
    @Test
    public void testUpperCaseFirst() throws IOException {
        Index testIndex = new Index("test");
        Settings collatorSettings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "tertiary")
                .put("index.analysis.filter.myCollator.caseFirst", "upper")
                .build();
        AnalysisService service = createAnalysisService(testIndex, collatorSettings);
        TokenFilterFactory collationFactory = service.tokenFilter("myCollator");

        TokenStream lowerKeys = collationFactory.create(new KeywordTokenizer(new StringReader("resume")));
        TokenStream upperKeys = collationFactory.create(new KeywordTokenizer(new StringReader("Resume")));

        // "Resume" must come first
        assertCollation(upperKeys, lowerKeys, -1);
    }
    /*
    * For German, you might want "oe" to sort and match with o-umlaut.
    * This is not the default, but a customized ruleset can be used to get it.
    *
    * The default is DIN 5007-1; this shows how to tailor a collator to get DIN 5007-2 behavior.
    *  http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
    *
    * The tailored rules are passed to the filter through the "rules" setting, and
    * at primary strength "Töne" and "Toene" are expected to collate to the same key.
    */
    @Test
    public void testCustomRules() throws Exception {
        RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));

        // One pair per umlaut and per case: the two-letter transcription, followed by the
        // base letter plus combining diaeresis (U+0308). The upper-case UE pair must use an
        // upper-case U, consistent with the AE and OE pairs; the previous lower-case u
        // re-tailored the lower-case umlaut and left the upper-case one untailored.
        String din5007Part2Tailorings =
                "& ae , a\u0308 & AE , A\u0308" +
                        "& oe , o\u0308 & OE , O\u0308" +
                        "& ue , u\u0308 & UE , U\u0308";

        RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + din5007Part2Tailorings);
        String tailoredRules = tailoredCollator.getRules();

        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.rules", tailoredRules)
                .put("index.analysis.filter.myCollator.strength", "primary")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);

        String germanUmlaut = "Töne";
        String germanOE = "Toene";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        TokenStream tsUmlaut = filterFactory.create(new KeywordTokenizer(new StringReader(germanUmlaut)));
        TokenStream tsOE = filterFactory.create(new KeywordTokenizer(new StringReader(germanOE)));
        assertCollatesToSame(tsUmlaut, tsOE);
    }
    /*
    * Builds an AnalysisService for the given index and settings: a parent injector
    * is created from the settings, environment and indices-analysis modules, and a
    * child injector adds the index-level modules plus an AnalysisModule that has the
    * IcuAnalysisBinderProcessor registered.
    */
    private AnalysisService createAnalysisService(Index index, Settings settings) {
        Environment environment = new Environment(settings);
        Injector parentInjector = new ModulesBuilder()
                .add(new SettingsModule(settings), new EnvironmentModule(environment), new IndicesAnalysisModule())
                .createInjector();

        IndicesAnalysisService indicesAnalysisService = parentInjector.getInstance(IndicesAnalysisService.class);
        Injector indexInjector = new ModulesBuilder()
                .add(
                        new IndexSettingsModule(index, settings),
                        new IndexNameModule(index),
                        new AnalysisModule(settings, indicesAnalysisService).addProcessor(new IcuAnalysisBinderProcessor()))
                .createChildInjector(parentInjector);

        return indexInjector.getInstance(AnalysisService.class);
    }
    /*
    * Asserts that the single terms produced by both streams compare as equal.
    */
    private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
        final int expectEqual = 0;
        assertCollation(stream1, stream2, expectEqual);
    }
    /*
    * Asserts that each stream yields exactly one token and that the two terms,
    * compared as strings, order with the same sign as the given comparison value
    * (negative: first term is smaller, zero: equal, positive: first term is larger).
    */
    private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
        CharTermAttribute firstTerm = stream1.addAttribute(CharTermAttribute.class);
        CharTermAttribute secondTerm = stream2.addAttribute(CharTermAttribute.class);

        // both streams must produce a token
        assertThat(stream1.incrementToken(), equalTo(true));
        assertThat(stream2.incrementToken(), equalTo(true));

        // only the direction of the comparison matters, not its magnitude
        int actualSign = Integer.signum(firstTerm.toString().compareTo(secondTerm.toString()));
        int expectedSign = Integer.signum(comparison);
        assertThat(actualSign, equalTo(expectedSign));

        // and neither stream may produce a second token
        assertThat(stream1.incrementToken(), equalTo(false));
        assertThat(stream2.incrementToken(), equalTo(false));
    }
 }