Exposed ICU collator options in IcuCollationTokenFilterFactory
Closes #6
commit 59d7f5cc14 (parent e7d045ed81)

README.md | 25
@@ -103,6 +103,31 @@ And here is a sample of custom collation:
    }
}

Optional options:

* `strength` - The strength property determines the minimum level of difference considered significant during comparison.
  The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
  Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`. See the ICU Collation documentation
  (http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) for a more detailed explanation of the specific values.

* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property to
  `canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were
  normalized. If `no` is set, it is the user's responsibility to ensure that all text is already in the appropriate form
  before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between
  faster and more complete collation behavior. Since a great many of the world's languages do not require text
  normalization, most locales set `no` as the default decomposition mode.

Expert options:

* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`
  to be either shifted or non-ignorable, which essentially boils down to whether punctuation and whitespace are ignored.

* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether case level sorting is required. When
  combined with strength `primary`, accent differences are ignored while case differences remain significant.

* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not
  ignored for strength `tertiary`.

* `numeric` - Possible values: `true` or `false`. Defaults to `false`. Whether digits are sorted according to their
  numeric representation. For example, the value `egg-9` is sorted before the value `egg-21`.

* `variableTop` - Single character or contraction. Controls which characters are treated as variable (and therefore
  ignorable) by the `alternate` setting; for example, setting it to a space makes whitespace ignorable while punctuation
  still counts.

* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Distinguishes between Katakana and
  Hiragana characters at `quaternary` strength.
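
As a rough illustration of how several of these options might be combined, the sketch below uses the same settings-builder style as the plugin's tests; the filter name `myCollator` and the chosen option values are placeholders, not recommendations.

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

// Sketch: settings for a custom "icu_collation" filter combining several of the documented options.
Settings settings = ImmutableSettings.settingsBuilder()
        .put("index.analysis.filter.myCollator.type", "icu_collation")
        .put("index.analysis.filter.myCollator.language", "en")
        .put("index.analysis.filter.myCollator.strength", "primary")
        .put("index.analysis.filter.myCollator.decomposition", "canonical")
        .put("index.analysis.filter.myCollator.alternate", "shifted")
        .put("index.analysis.filter.myCollator.numeric", "true")
        .build();

Configured this way, the resulting filter should compare tokens case- and accent-insensitively, ignore punctuation and whitespace, and order embedded digits by numeric value.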
ICU Tokenizer
-------------

pom.xml | 10
@@ -68,6 +68,16 @@
            <artifactId>testng</artifactId>
            <version>6.8</version>
            <scope>test</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.hamcrest</groupId>
                    <artifactId>hamcrest-core</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>junit</groupId>
                    <artifactId>junit</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <dependency>

IcuCollationTokenFilterFactory.java
@@ -45,8 +45,6 @@ import java.io.IOException;
 * <p>The second option is to specify collation rules as defined in the <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
 * Collation customization</a> chapter in icu docs. The <tt>rules</tt> parameter can either embed the rules definition
 * in the settings or refer to an external location (preferable located under the <tt>config</tt> location, relative to it).
 */
public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
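
The `rules` parameter described in the Javadoc above can either embed the rules definition or point to an external file. A sketch of the embedded form, with a placeholder filter name and a single arbitrary tailoring rule:

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

// Sketch: pass a collation tailoring rule inline through the `rules` setting.
// The same setting may instead refer to a rules file, preferably under the config location.
Settings settings = ImmutableSettings.settingsBuilder()
        .put("index.analysis.filter.myCollator.type", "icu_collation")
        .put("index.analysis.filter.myCollator.rules", "& oe , o\u0308 & OE , O\u0308")
        .build();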
@@ -96,6 +94,81 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
                collator = Collator.getInstance();
            }
        }

        // set the strength flag, otherwise it will be the default.
        String strength = settings.get("strength");
        if (strength != null) {
            if (strength.equalsIgnoreCase("primary")) {
                collator.setStrength(Collator.PRIMARY);
            } else if (strength.equalsIgnoreCase("secondary")) {
                collator.setStrength(Collator.SECONDARY);
            } else if (strength.equalsIgnoreCase("tertiary")) {
                collator.setStrength(Collator.TERTIARY);
            } else if (strength.equalsIgnoreCase("quaternary")) {
                collator.setStrength(Collator.QUATERNARY);
            } else if (strength.equalsIgnoreCase("identical")) {
                collator.setStrength(Collator.IDENTICAL);
            } else {
                throw new ElasticSearchIllegalArgumentException("Invalid strength: " + strength);
            }
        }

        // set the decomposition flag, otherwise it will be the default.
        String decomposition = settings.get("decomposition");
        if (decomposition != null) {
            if (decomposition.equalsIgnoreCase("no")) {
                collator.setDecomposition(Collator.NO_DECOMPOSITION);
            } else if (decomposition.equalsIgnoreCase("canonical")) {
                collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
            } else {
                throw new ElasticSearchIllegalArgumentException("Invalid decomposition: " + decomposition);
            }
        }

        // expert options: concrete subclasses are always a RuleBasedCollator
        RuleBasedCollator rbc = (RuleBasedCollator) collator;
        String alternate = settings.get("alternate");
        if (alternate != null) {
            if (alternate.equalsIgnoreCase("shifted")) {
                rbc.setAlternateHandlingShifted(true);
            } else if (alternate.equalsIgnoreCase("non-ignorable")) {
                rbc.setAlternateHandlingShifted(false);
            } else {
                throw new ElasticSearchIllegalArgumentException("Invalid alternate: " + alternate);
            }
        }

        Boolean caseLevel = settings.getAsBoolean("caseLevel", null);
        if (caseLevel != null) {
            rbc.setCaseLevel(caseLevel);
        }

        String caseFirst = settings.get("caseFirst");
        if (caseFirst != null) {
            if (caseFirst.equalsIgnoreCase("lower")) {
                rbc.setLowerCaseFirst(true);
            } else if (caseFirst.equalsIgnoreCase("upper")) {
                rbc.setUpperCaseFirst(true);
            } else {
                throw new ElasticSearchIllegalArgumentException("Invalid caseFirst: " + caseFirst);
            }
        }

        Boolean numeric = settings.getAsBoolean("numeric", null);
        if (numeric != null) {
            rbc.setNumericCollation(numeric);
        }

        String variableTop = settings.get("variableTop");
        if (variableTop != null) {
            rbc.setVariableTop(variableTop);
        }

        Boolean hiraganaQuaternaryMode = settings.getAsBoolean("hiraganaQuaternaryMode", null);
        if (hiraganaQuaternaryMode != null) {
            rbc.setHiraganaQuaternary(hiraganaQuaternaryMode);
        }

        this.collator = collator;
    }
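
To make the mapping from settings to ICU calls concrete, here is a minimal plain-ICU4J sketch (outside Elasticsearch) of roughly what the factory configures when, for example, `strength=primary`, `alternate=shifted` and `numeric=true` are set; the class name and the `en` locale are arbitrary illustrative choices.

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;

public class CollatorOptionsSketch {
    public static void main(String[] args) {
        // Same calls the factory makes for strength=primary, alternate=shifted, numeric=true.
        RuleBasedCollator rbc = (RuleBasedCollator) Collator.getInstance(new ULocale("en"));
        rbc.setStrength(Collator.PRIMARY);
        rbc.setAlternateHandlingShifted(true);
        rbc.setNumericCollation(true);

        // Digits now compare by numeric value, so "egg-9" sorts before "egg-21" (see README example).
        System.out.println(rbc.compare("egg-9", "egg-21") < 0); // true
    }
}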

SimpleIcuAnalysisTests.java
@@ -29,10 +29,10 @@ import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.testng.annotations.Test;

import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.instanceOf;

/**
@@ -53,18 +53,18 @@ public class SimpleIcuAnalysisTests {
        AnalysisService analysisService = injector.getInstance(AnalysisService.class);

        TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer");
        assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));

        TokenFilterFactory filterFactory = analysisService.tokenFilter("icu_normalizer");
        assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));

        filterFactory = analysisService.tokenFilter("icu_folding");
        assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));

        filterFactory = analysisService.tokenFilter("icu_collation");
        assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));

        filterFactory = analysisService.tokenFilter("icu_transform");
        assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
    }
}

SimpleIcuCollationTokenFilterTests.java (new file)
@@ -0,0 +1,300 @@
package org.elasticsearch.index.analysis;

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.testng.annotations.Test;

import java.io.IOException;
import java.io.StringReader;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;

// Tests borrowed from Solr's Icu collation key filter factory test.
public class SimpleIcuCollationTokenFilterTests {

    /*
     * Turkish has some funny casing.
     * This test shows how you can solve this kind of thing easily with collation.
     * Instead of using LowerCaseFilter, use a turkish collator with primary strength.
     * Then things will sort and match correctly.
     */
    @Test
    public void testBasicUsage() throws Exception {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "tr")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);

        String turkishUpperCase = "I WİLL USE TURKİSH CASING";
        String turkishLowerCase = "ı will use turkish casıng";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");

        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase)));
        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase)));
        assertCollatesToSame(tsUpper, tsLower);
    }

    /*
     * Test usage of the decomposition option for unicode normalization.
     */
    @Test
    public void testNormalization() throws IOException {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "tr")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.decomposition", "canonical")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);

        String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
        String turkishLowerCase = "ı will use turkish casıng";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");

        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase)));
        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase)));
        assertCollatesToSame(tsUpper, tsLower);
    }

    /*
     * Test secondary strength, for english case is not significant.
     */
    @Test
    public void testSecondaryStrength() throws IOException {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "secondary")
                .put("index.analysis.filter.myCollator.decomposition", "no")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);

        String upperCase = "TESTING";
        String lowerCase = "testing";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");

        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upperCase)));
        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lowerCase)));
        assertCollatesToSame(tsUpper, tsLower);
    }

    /*
     * Setting alternate=shifted to shift whitespace, punctuation and symbols
     * to quaternary level
     */
    @Test
    public void testIgnorePunctuation() throws IOException {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.alternate", "shifted")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);

        String withPunctuation = "foo-bar";
        String withoutPunctuation = "foo bar";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");

        TokenStream tsPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation)));
        TokenStream tsWithoutPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withoutPunctuation)));
        assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
    }

    /*
     * Setting alternate=shifted and variableTop to shift whitespace, but not
     * punctuation or symbols, to quaternary level
     */
    @Test
    public void testIgnoreWhitespace() throws IOException {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.alternate", "shifted")
                .put("index.analysis.filter.myCollator.variableTop", " ")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);

        String withSpace = "foo bar";
        String withoutSpace = "foobar";
        String withPunctuation = "foo-bar";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");

        TokenStream tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace)));
        TokenStream tsWithoutSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withoutSpace)));
        assertCollatesToSame(tsWithSpace, tsWithoutSpace);
        // now assert that punctuation still matters: foo-bar < foo bar
        tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace)));
        TokenStream tsWithPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation)));
        assertCollation(tsWithPunctuation, tsWithSpace, -1);
    }

    /*
     * Setting numeric to encode digits with numeric value, so that
     * foobar-9 sorts before foobar-10
     */
    @Test
    public void testNumerics() throws IOException {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.numeric", "true")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);

        String nine = "foobar-9";
        String ten = "foobar-10";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");

        TokenStream tsNine = filterFactory.create(new KeywordTokenizer(new StringReader(nine)));
        TokenStream tsTen = filterFactory.create(new KeywordTokenizer(new StringReader(ten)));
        assertCollation(tsNine, tsTen, -1);
    }

    /*
     * Setting caseLevel=true to create an additional case level between
     * secondary and tertiary
     */
    @Test
    public void testIgnoreAccentsButNotCase() throws IOException {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.caseLevel", "true")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);

        String withAccents = "résumé";
        String withoutAccents = "resume";
        String withAccentsUpperCase = "Résumé";
        String withoutAccentsUpperCase = "Resume";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");

        TokenStream tsWithAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withAccents)));
        TokenStream tsWithoutAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents)));
        assertCollatesToSame(tsWithAccents, tsWithoutAccents);

        TokenStream tsWithAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
        TokenStream tsWithoutAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
        assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);

        // now assert that case still matters: resume < Resume
        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents)));
        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
        assertCollation(tsLower, tsUpper, -1);
    }

    /*
     * Setting caseFirst=upper to cause uppercase strings to sort
     * before lowercase ones.
     */
    @Test
    public void testUpperCaseFirst() throws IOException {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "tertiary")
                .put("index.analysis.filter.myCollator.caseFirst", "upper")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);

        String lower = "resume";
        String upper = "Resume";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");

        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lower)));
        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upper)));
        assertCollation(tsUpper, tsLower, -1);
    }

    /*
     * For german, you might want oe to sort and match with o umlaut.
     * This is not the default, but you can make a customized ruleset to do this.
     *
     * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
     * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
     */
    @Test
    public void testCustomRules() throws Exception {
        RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
        String DIN5007_2_tailorings =
                "& ae , a\u0308 & AE , A\u0308" +
                "& oe , o\u0308 & OE , O\u0308" +
                "& ue , u\u0308 & UE , u\u0308";

        RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
        String tailoredRules = tailoredCollator.getRules();

        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.rules", tailoredRules)
                .put("index.analysis.filter.myCollator.strength", "primary")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);

        String germanUmlaut = "Töne";
        String germanOE = "Toene";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        TokenStream tsUmlaut = filterFactory.create(new KeywordTokenizer(new StringReader(germanUmlaut)));
        TokenStream tsOE = filterFactory.create(new KeywordTokenizer(new StringReader(germanOE)));
        assertCollatesToSame(tsUmlaut, tsOE);
    }

    private AnalysisService createAnalysisService(Index index, Settings settings) {
        Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
        Injector injector = new ModulesBuilder().add(
                new IndexSettingsModule(index, settings),
                new IndexNameModule(index),
                new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor()))
                .createChildInjector(parentInjector);

        return injector.getInstance(AnalysisService.class);
    }

    private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
        assertCollation(stream1, stream2, 0);
    }

    private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
        CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
        CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
        assertThat(stream1.incrementToken(), equalTo(true));
        assertThat(stream2.incrementToken(), equalTo(true));
        assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison)));
        assertThat(stream1.incrementToken(), equalTo(false));
        assertThat(stream2.incrementToken(), equalTo(false));
    }

}