Exposed ICU collator options in IcuCollationTokenFilterFactory

Closes #6
This commit is contained in:
Martijn van Groningen 2012-09-28 09:58:57 +02:00
parent e7d045ed81
commit 59d7f5cc14
5 changed files with 416 additions and 8 deletions

View File

@ -103,6 +103,31 @@ And here is a sample of custom collation:
} }
} }
Optional options:
* `strength` - The strength property determines the minimum level of difference considered significant during comparison.
The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`.
See ICU Collation:http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html documentation for a more detailed
explanation for the specific values.
* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property with
`canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were
normalized. If `no` is set, it is the user's responsibility to insure that all text is already in the appropriate form
before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between
faster and more complete collation behavior. Since a great many of the world's languages do not require text
normalization, most locales set `no` as the default decomposition mode.
Expert options:
* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`
to be either shifted or non-ignorable. What boils down to ignoring punctuation and whitespace.
* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether case level sorting is required. When
strength is set to `primary` this will ignore accent differences.
* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored
for strength `tertiary`.
* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to numeric representation. For
example the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`.
* `variableTop` - Single character or contraction. Controls what is variable for `alternate`.
* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Distinguishing between Katakana
and Hiragana characters in `quaternary` strength .
ICU Tokenizer ICU Tokenizer
------------- -------------

10
pom.xml
View File

@ -68,6 +68,16 @@
<artifactId>testng</artifactId> <artifactId>testng</artifactId>
<version>6.8</version> <version>6.8</version>
<scope>test</scope> <scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-core</artifactId>
</exclusion>
<exclusion>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</exclusion>
</exclusions>
</dependency> </dependency>
<dependency> <dependency>

View File

@ -45,8 +45,6 @@ import java.io.IOException;
* <p>The second option is to specify collation rules as defined in the <a href="http://www.icu-project.org/userguide/Collate_Customization.html"> * <p>The second option is to specify collation rules as defined in the <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
* Collation customization</a> chapter in icu docs. The <tt>rules</tt> parameter can either embed the rules definition * Collation customization</a> chapter in icu docs. The <tt>rules</tt> parameter can either embed the rules definition
* in the settings or refer to an external location (preferable located under the <tt>config</tt> location, relative to it). * in the settings or refer to an external location (preferable located under the <tt>config</tt> location, relative to it).
*
*
*/ */
public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
@ -96,6 +94,81 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
collator = Collator.getInstance(); collator = Collator.getInstance();
} }
} }
// set the strength flag, otherwise it will be the default.
String strength = settings.get("strength");
if (strength != null) {
if (strength.equalsIgnoreCase("primary")) {
collator.setStrength(Collator.PRIMARY);
} else if (strength.equalsIgnoreCase("secondary")) {
collator.setStrength(Collator.SECONDARY);
} else if (strength.equalsIgnoreCase("tertiary")) {
collator.setStrength(Collator.TERTIARY);
} else if (strength.equalsIgnoreCase("quaternary")) {
collator.setStrength(Collator.QUATERNARY);
} else if (strength.equalsIgnoreCase("identical")) {
collator.setStrength(Collator.IDENTICAL);
} else {
throw new ElasticSearchIllegalArgumentException("Invalid strength: " + strength);
}
}
// set the decomposition flag, otherwise it will be the default.
String decomposition = settings.get("decomposition");
if (decomposition != null) {
if (decomposition.equalsIgnoreCase("no")) {
collator.setDecomposition(Collator.NO_DECOMPOSITION);
} else if (decomposition.equalsIgnoreCase("canonical")) {
collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
} else {
throw new ElasticSearchIllegalArgumentException("Invalid decomposition: " + decomposition);
}
}
// expert options: concrete subclasses are always a RuleBasedCollator
RuleBasedCollator rbc = (RuleBasedCollator) collator;
String alternate = settings.get("alternate");
if (alternate != null) {
if (alternate.equalsIgnoreCase("shifted")) {
rbc.setAlternateHandlingShifted(true);
} else if (alternate.equalsIgnoreCase("non-ignorable")) {
rbc.setAlternateHandlingShifted(false);
} else {
throw new ElasticSearchIllegalArgumentException("Invalid alternate: " + alternate);
}
}
Boolean caseLevel = settings.getAsBoolean("caseLevel", null);
if (caseLevel != null) {
rbc.setCaseLevel(caseLevel);
}
String caseFirst = settings.get("caseFirst");
if (caseFirst != null) {
if (caseFirst.equalsIgnoreCase("lower")) {
rbc.setLowerCaseFirst(true);
} else if (caseFirst.equalsIgnoreCase("upper")) {
rbc.setUpperCaseFirst(true);
} else {
throw new ElasticSearchIllegalArgumentException("Invalid caseFirst: " + caseFirst);
}
}
Boolean numeric = settings.getAsBoolean("numeric", null);
if (numeric != null) {
rbc.setNumericCollation(numeric);
}
String variableTop = settings.get("variableTop");
if (variableTop != null) {
rbc.setVariableTop(variableTop);
}
Boolean hiraganaQuaternaryMode = settings.getAsBoolean("hiraganaQuaternaryMode", null);
if (hiraganaQuaternaryMode != null) {
rbc.setHiraganaQuaternary(hiraganaQuaternaryMode);
}
this.collator = collator; this.collator = collator;
} }

View File

@ -29,10 +29,10 @@ import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.settings.IndexSettingsModule; import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule; import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService; import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.hamcrest.MatcherAssert;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS; import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.instanceOf;
/** /**
@ -53,18 +53,18 @@ public class SimpleIcuAnalysisTests {
AnalysisService analysisService = injector.getInstance(AnalysisService.class); AnalysisService analysisService = injector.getInstance(AnalysisService.class);
TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer"); TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer");
MatcherAssert.assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class)); assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));
TokenFilterFactory filterFactory = analysisService.tokenFilter("icu_normalizer"); TokenFilterFactory filterFactory = analysisService.tokenFilter("icu_normalizer");
MatcherAssert.assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class)); assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));
filterFactory = analysisService.tokenFilter("icu_folding"); filterFactory = analysisService.tokenFilter("icu_folding");
MatcherAssert.assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class)); assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));
filterFactory = analysisService.tokenFilter("icu_collation"); filterFactory = analysisService.tokenFilter("icu_collation");
MatcherAssert.assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class)); assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));
filterFactory = analysisService.tokenFilter("icu_transform"); filterFactory = analysisService.tokenFilter("icu_transform");
MatcherAssert.assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class)); assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
} }
} }

View File

@ -0,0 +1,300 @@
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.testng.annotations.Test;
import java.io.IOException;
import java.io.StringReader;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
// Tests borrowed from Solr's Icu collation key filter factory test.
public class SimpleIcuCollationTokenFilterTests {
/*
* Turkish has some funny casing.
* This test shows how you can solve this kind of thing easily with collation.
* Instead of using LowerCaseFilter, use a turkish collator with primary strength.
* Then things will sort and match correctly.
*/
@Test
public void testBasicUsage() throws Exception {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "tr")
.put("index.analysis.filter.myCollator.strength", "primary")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String turkishUpperCase = "I WİLL USE TURKİSH CASING";
String turkishLowerCase = "ı will use turkish casıng";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase)));
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
/*
* Test usage of the decomposition option for unicode normalization.
*/
@Test
public void testNormalization() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "tr")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.decomposition", "canonical")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
String turkishLowerCase = "ı will use turkish casıng";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase)));
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
/*
* Test secondary strength, for english case is not significant.
*/
@Test
public void testSecondaryStrength() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "secondary")
.put("index.analysis.filter.myCollator.decomposition", "no")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String upperCase = "TESTING";
String lowerCase = "testing";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upperCase)));
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
/*
* Setting alternate=shifted to shift whitespace, punctuation and symbols
* to quaternary level
*/
@Test
public void testIgnorePunctuation() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.alternate", "shifted")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String withPunctuation = "foo-bar";
String withoutPunctuation = "foo bar";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation)));
TokenStream tsWithoutPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withoutPunctuation)));
assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
}
/*
* Setting alternate=shifted and variableTop to shift whitespace, but not
* punctuation or symbols, to quaternary level
*/
@Test
public void testIgnoreWhitespace() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.alternate", "shifted")
.put("index.analysis.filter.myCollator.variableTop", " ")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String withSpace = "foo bar";
String withoutSpace = "foobar";
String withPunctuation = "foo-bar";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace)));
TokenStream tsWithoutSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withoutSpace)));
assertCollatesToSame(tsWithSpace, tsWithoutSpace);
// now assert that punctuation still matters: foo-bar < foo bar
tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace)));
TokenStream tsWithPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation)));
assertCollation(tsWithPunctuation, tsWithSpace, -1);
}
/*
* Setting numeric to encode digits with numeric value, so that
* foobar-9 sorts before foobar-10
*/
@Test
public void testNumerics() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.numeric", "true")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String nine = "foobar-9";
String ten = "foobar-10";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsNine = filterFactory.create(new KeywordTokenizer(new StringReader(nine)));
TokenStream tsTen = filterFactory.create(new KeywordTokenizer(new StringReader(ten)));
assertCollation(tsNine, tsTen, -1);
}
/*
* Setting caseLevel=true to create an additional case level between
* secondary and tertiary
*/
@Test
public void testIgnoreAccentsButNotCase() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.caseLevel", "true")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String withAccents = "résumé";
String withoutAccents = "resume";
String withAccentsUpperCase = "Résumé";
String withoutAccentsUpperCase = "Resume";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsWithAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withAccents)));
TokenStream tsWithoutAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents)));
assertCollatesToSame(tsWithAccents, tsWithoutAccents);
TokenStream tsWithAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
TokenStream tsWithoutAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);
// now assert that case still matters: resume < Resume
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents)));
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
assertCollation(tsLower, tsUpper, -1);
}
/*
* Setting caseFirst=upper to cause uppercase strings to sort
* before lowercase ones.
*/
@Test
public void testUpperCaseFirst() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "tertiary")
.put("index.analysis.filter.myCollator.caseFirst", "upper")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String lower = "resume";
String upper = "Resume";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lower)));
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upper)));
assertCollation(tsUpper, tsLower, -1);
}
/*
* For german, you might want oe to sort and match with o umlaut.
* This is not the default, but you can make a customized ruleset to do this.
*
* The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
* http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
*/
@Test
public void testCustomRules() throws Exception {
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
String DIN5007_2_tailorings =
"& ae , a\u0308 & AE , A\u0308"+
"& oe , o\u0308 & OE , O\u0308"+
"& ue , u\u0308 & UE , u\u0308";
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
String tailoredRules = tailoredCollator.getRules();
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.rules", tailoredRules)
.put("index.analysis.filter.myCollator.strength", "primary")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String germanUmlaut = "Töne";
String germanOE = "Toene";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsUmlaut = filterFactory.create(new KeywordTokenizer(new StringReader(germanUmlaut)));
TokenStream tsOE = filterFactory.create(new KeywordTokenizer(new StringReader(germanOE)));
assertCollatesToSame(tsUmlaut, tsOE);
}
private AnalysisService createAnalysisService(Index index, Settings settings) {
Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
Injector injector = new ModulesBuilder().add(
new IndexSettingsModule(index, settings),
new IndexNameModule(index),
new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor()))
.createChildInjector(parentInjector);
return injector.getInstance(AnalysisService.class);
}
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
assertCollation(stream1, stream2, 0);
}
private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
CharTermAttribute term1 = stream1
.addAttribute(CharTermAttribute.class);
CharTermAttribute term2 = stream2
.addAttribute(CharTermAttribute.class);
assertThat(stream1.incrementToken(), equalTo(true));
assertThat(stream2.incrementToken(), equalTo(true));
assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison)));
assertThat(stream1.incrementToken(), equalTo(false));
assertThat(stream2.incrementToken(), equalTo(false));
}
}