upgrade to lucene 5 snapshot (will open issue about collators)
This commit is contained in:
parent
472c21a138
commit
c2c0345837
83
README.md
83
README.md
|
@ -41,7 +41,7 @@ Normalizes characters as explained [here](http://userguide.icu-project.org/trans
|
|||
"index" : {
|
||||
"analysis" : {
|
||||
"analyzer" : {
|
||||
"collation" : {
|
||||
"normalized" : {
|
||||
"tokenizer" : "keyword",
|
||||
"filter" : ["icu_normalizer"]
|
||||
}
|
||||
|
@ -61,7 +61,7 @@ Folding of unicode characters based on `UTR#30`. It registers itself under `icu_
|
|||
"index" : {
|
||||
"analysis" : {
|
||||
"analyzer" : {
|
||||
"collation" : {
|
||||
"folded" : {
|
||||
"tokenizer" : "keyword",
|
||||
"filter" : ["icu_folding"]
|
||||
}
|
||||
|
@ -101,81 +101,6 @@ The Following example exempts Swedish characters from the folding. Note that the
|
|||
}
|
||||
```
|
||||
|
||||
ICU Collation
|
||||
-------------
|
||||
|
||||
Uses the collation token filter. Allows either specifying the rules for collation
|
||||
(defined [here](http://www.icu-project.org/userguide/Collate_Customization.html)) using the `rules` parameter
|
||||
(which can point to a location or be expressed directly in the settings; the location can be relative to the config location), or using the
|
||||
`language` parameter (further specialized by country and variant). By default registers under `icu_collation` or
|
||||
`icuCollation` and uses the default locale.
|
||||
|
||||
Here are sample settings:
|
||||
|
||||
```js
|
||||
{
|
||||
"index" : {
|
||||
"analysis" : {
|
||||
"analyzer" : {
|
||||
"collation" : {
|
||||
"tokenizer" : "keyword",
|
||||
"filter" : ["icu_collation"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
And here is a sample of custom collation:
|
||||
|
||||
```js
|
||||
{
|
||||
"index" : {
|
||||
"analysis" : {
|
||||
"analyzer" : {
|
||||
"collation" : {
|
||||
"tokenizer" : "keyword",
|
||||
"filter" : ["myCollator"]
|
||||
}
|
||||
},
|
||||
"filter" : {
|
||||
"myCollator" : {
|
||||
"type" : "icu_collation",
|
||||
"language" : "en"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Optional options:
|
||||
* `strength` - The strength property determines the minimum level of difference considered significant during comparison.
|
||||
The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
|
||||
Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`.
|
||||
See [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) documentation for a more detailed
|
||||
explanation for the specific values.
|
||||
* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property with
|
||||
`canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were
|
||||
normalized. If `no` is set, it is the user's responsibility to ensure that all text is already in the appropriate form
|
||||
before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between
|
||||
faster and more complete collation behavior. Since a great many of the world's languages do not require text
|
||||
normalization, most locales set `no` as the default decomposition mode.
|
||||
|
||||
Expert options:
|
||||
* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`
|
||||
to be either shifted or non-ignorable. This boils down to ignoring punctuation and whitespace.
|
||||
* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether case level sorting is required. When
|
||||
strength is set to `primary`, this will ignore accent differences.
|
||||
* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored
|
||||
for strength `tertiary`.
|
||||
* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to numeric representation. For
|
||||
example, the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`.
|
||||
* `variableTop` - Single character or contraction. Controls what is variable for `alternate`.
|
||||
* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Distinguishes between Katakana
|
||||
and Hiragana characters in `quaternary` strength.
|
||||
|
||||
ICU Tokenizer
|
||||
-------------
|
||||
|
||||
|
@ -186,7 +111,7 @@ Breaks text into words according to [UAX #29: Unicode Text Segmentation](http://
|
|||
"index" : {
|
||||
"analysis" : {
|
||||
"analyzer" : {
|
||||
"collation" : {
|
||||
"tokenized" : {
|
||||
"tokenizer" : "icu_tokenizer",
|
||||
}
|
||||
}
|
||||
|
@ -211,7 +136,7 @@ Here is a sample settings:
|
|||
"index" : {
|
||||
"analysis" : {
|
||||
"analyzer" : {
|
||||
"collation" : {
|
||||
"normalized" : {
|
||||
"tokenizer" : "keyword",
|
||||
"char_filter" : ["icu_normalizer"]
|
||||
}
|
||||
|
|
8
pom.xml
8
pom.xml
|
@ -33,8 +33,8 @@
|
|||
|
||||
<properties>
|
||||
<elasticsearch.version>2.0.0-SNAPSHOT</elasticsearch.version>
|
||||
<lucene.version>4.10.2</lucene.version>
|
||||
<lucene.maven.version>4.10.2</lucene.maven.version>
|
||||
<lucene.version>5.0.0</lucene.version>
|
||||
<lucene.maven.version>5.0.0-snapshot-1636426</lucene.maven.version>
|
||||
<tests.jvms>1</tests.jvms>
|
||||
<tests.shuffle>true</tests.shuffle>
|
||||
<tests.output>onerror</tests.output>
|
||||
|
@ -47,6 +47,10 @@
|
|||
<id>sonatype</id>
|
||||
<url>http://oss.sonatype.org/content/repositories/releases/</url>
|
||||
</repository>
|
||||
<repository>
|
||||
<id>Lucene snapshots</id>
|
||||
<url>https://download.elasticsearch.org/lucenesnapshots/maven/</url>
|
||||
</repository>
|
||||
</repositories>
|
||||
|
||||
<dependencies>
|
||||
|
|
|
@ -23,7 +23,6 @@ import com.ibm.icu.text.Collator;
|
|||
import com.ibm.icu.text.RuleBasedCollator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.collation.ICUCollationKeyFilter;
|
||||
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||
|
@ -174,6 +173,7 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new ICUCollationKeyFilter(tokenStream, collator);
|
||||
throw new UnsupportedOperationException("i was deprecated in lucene 4, and now i'm gone");
|
||||
// TODO: lucene does sort keys as binary keys since 4.x
|
||||
}
|
||||
}
|
||||
|
|
|
@ -39,8 +39,8 @@ public class IcuTokenizerFactory extends AbstractTokenizerFactory {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Tokenizer create(Reader reader) {
|
||||
return new ICUTokenizer(reader);
|
||||
public Tokenizer create() {
|
||||
return new ICUTokenizer();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -19,7 +19,6 @@
|
|||
|
||||
package org.elasticsearch.indices.analysis;
|
||||
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.text.Normalizer2;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -27,7 +26,6 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
|
||||
import org.apache.lucene.analysis.icu.ICUTransformFilter;
|
||||
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
|
||||
import org.apache.lucene.collation.ICUCollationKeyFilter;
|
||||
import org.elasticsearch.common.component.AbstractComponent;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
|
@ -36,8 +34,6 @@ import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
|
|||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
/**
|
||||
* Registers indices level analysis components so, if not explicitly configured, will be shared
|
||||
* among all indices.
|
||||
|
@ -55,8 +51,8 @@ public class IcuIndicesAnalysis extends AbstractComponent {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Tokenizer create(Reader reader) {
|
||||
return new ICUTokenizer(reader);
|
||||
public Tokenizer create() {
|
||||
return new ICUTokenizer();
|
||||
}
|
||||
}));
|
||||
|
||||
|
@ -85,18 +81,6 @@ public class IcuIndicesAnalysis extends AbstractComponent {
|
|||
}
|
||||
}));
|
||||
|
||||
indicesAnalysisService.tokenFilterFactories().put("icu_collation", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
|
||||
@Override
|
||||
public String name() {
|
||||
return "icu_collation";
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new ICUCollationKeyFilter(tokenStream, Collator.getInstance());
|
||||
}
|
||||
}));
|
||||
|
||||
indicesAnalysisService.tokenFilterFactories().put("icu_transform", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
|
||||
@Override
|
||||
public String name() {
|
||||
|
|
|
@ -52,10 +52,8 @@ public class ICUIntegrationTests extends ElasticsearchIntegrationTest {
|
|||
Settings settings = ImmutableSettings.builder()
|
||||
.put(super.indexSettings())
|
||||
.put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
|
||||
.putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_collator")
|
||||
.put("index.analysis.filter.my_collator.type", "icu_collation")
|
||||
.put("index.analysis.filter.my_collator.language", "en")
|
||||
.put("index.analysis.filter.my_collator.strength", "primary")
|
||||
.putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "my_folding")
|
||||
.put("index.analysis.filter.my_folding.type", "icu_folding")
|
||||
.build();
|
||||
|
||||
return settings;
|
||||
|
|
|
@ -1,303 +0,0 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.text.RuleBasedCollator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.test.ElasticsearchTestCase;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService;
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
|
||||
// Tests borrowed from Solr's Icu collation key filter factory test.
|
||||
public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase {
|
||||
|
||||
/*
|
||||
* Turkish has some funny casing.
|
||||
* This test shows how you can solve this kind of thing easily with collation.
|
||||
* Instead of using LowerCaseFilter, use a turkish collator with primary strength.
|
||||
* Then things will sort and match correctly.
|
||||
*/
|
||||
@Test
|
||||
public void testBasicUsage() throws Exception {
|
||||
Index index = new Index("test");
|
||||
Settings settings = ImmutableSettings.settingsBuilder()
|
||||
.put("index.analysis.filter.myCollator.type", "icu_collation")
|
||||
.put("index.analysis.filter.myCollator.language", "tr")
|
||||
.put("index.analysis.filter.myCollator.strength", "primary")
|
||||
.build();
|
||||
AnalysisService analysisService = createAnalysisService(settings);
|
||||
|
||||
String turkishUpperCase = "I WİLL USE TURKİSH CASING";
|
||||
String turkishLowerCase = "ı will use turkish casıng";
|
||||
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
|
||||
|
||||
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase)));
|
||||
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase)));
|
||||
assertCollatesToSame(tsUpper, tsLower);
|
||||
}
|
||||
|
||||
/*
|
||||
* Test usage of the decomposition option for unicode normalization.
|
||||
*/
|
||||
@Test
|
||||
public void testNormalization() throws IOException {
|
||||
Index index = new Index("test");
|
||||
Settings settings = ImmutableSettings.settingsBuilder()
|
||||
.put("index.analysis.filter.myCollator.type", "icu_collation")
|
||||
.put("index.analysis.filter.myCollator.language", "tr")
|
||||
.put("index.analysis.filter.myCollator.strength", "primary")
|
||||
.put("index.analysis.filter.myCollator.decomposition", "canonical")
|
||||
.build();
|
||||
AnalysisService analysisService = createAnalysisService(settings);
|
||||
|
||||
String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
|
||||
String turkishLowerCase = "ı will use turkish casıng";
|
||||
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
|
||||
|
||||
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase)));
|
||||
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase)));
|
||||
assertCollatesToSame(tsUpper, tsLower);
|
||||
}
|
||||
|
||||
/*
|
||||
* Test secondary strength, for english case is not significant.
|
||||
*/
|
||||
@Test
|
||||
public void testSecondaryStrength() throws IOException {
|
||||
Index index = new Index("test");
|
||||
Settings settings = ImmutableSettings.settingsBuilder()
|
||||
.put("index.analysis.filter.myCollator.type", "icu_collation")
|
||||
.put("index.analysis.filter.myCollator.language", "en")
|
||||
.put("index.analysis.filter.myCollator.strength", "secondary")
|
||||
.put("index.analysis.filter.myCollator.decomposition", "no")
|
||||
.build();
|
||||
AnalysisService analysisService = createAnalysisService(settings);
|
||||
|
||||
String upperCase = "TESTING";
|
||||
String lowerCase = "testing";
|
||||
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
|
||||
|
||||
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upperCase)));
|
||||
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lowerCase)));
|
||||
assertCollatesToSame(tsUpper, tsLower);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting alternate=shifted to shift whitespace, punctuation and symbols
|
||||
* to quaternary level
|
||||
*/
|
||||
@Test
|
||||
public void testIgnorePunctuation() throws IOException {
|
||||
Index index = new Index("test");
|
||||
Settings settings = ImmutableSettings.settingsBuilder()
|
||||
.put("index.analysis.filter.myCollator.type", "icu_collation")
|
||||
.put("index.analysis.filter.myCollator.language", "en")
|
||||
.put("index.analysis.filter.myCollator.strength", "primary")
|
||||
.put("index.analysis.filter.myCollator.alternate", "shifted")
|
||||
.build();
|
||||
AnalysisService analysisService = createAnalysisService(settings);
|
||||
|
||||
String withPunctuation = "foo-bar";
|
||||
String withoutPunctuation = "foo bar";
|
||||
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
|
||||
|
||||
TokenStream tsPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation)));
|
||||
TokenStream tsWithoutPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withoutPunctuation)));
|
||||
assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting alternate=shifted and variableTop to shift whitespace, but not
|
||||
* punctuation or symbols, to quaternary level
|
||||
*/
|
||||
@Test
|
||||
public void testIgnoreWhitespace() throws IOException {
|
||||
Index index = new Index("test");
|
||||
Settings settings = ImmutableSettings.settingsBuilder()
|
||||
.put("index.analysis.filter.myCollator.type", "icu_collation")
|
||||
.put("index.analysis.filter.myCollator.language", "en")
|
||||
.put("index.analysis.filter.myCollator.strength", "primary")
|
||||
.put("index.analysis.filter.myCollator.alternate", "shifted")
|
||||
.put("index.analysis.filter.myCollator.variableTop", " ")
|
||||
.build();
|
||||
AnalysisService analysisService = createAnalysisService(settings);
|
||||
|
||||
String withSpace = "foo bar";
|
||||
String withoutSpace = "foobar";
|
||||
String withPunctuation = "foo-bar";
|
||||
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
|
||||
|
||||
TokenStream tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace)));
|
||||
TokenStream tsWithoutSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withoutSpace)));
|
||||
assertCollatesToSame(tsWithSpace, tsWithoutSpace);
|
||||
// now assert that punctuation still matters: foo-bar < foo bar
|
||||
tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace)));
|
||||
TokenStream tsWithPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation)));
|
||||
assertCollation(tsWithPunctuation, tsWithSpace, -1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting numeric to encode digits with numeric value, so that
|
||||
* foobar-9 sorts before foobar-10
|
||||
*/
|
||||
@Test
|
||||
public void testNumerics() throws IOException {
|
||||
Index index = new Index("test");
|
||||
Settings settings = ImmutableSettings.settingsBuilder()
|
||||
.put("index.analysis.filter.myCollator.type", "icu_collation")
|
||||
.put("index.analysis.filter.myCollator.language", "en")
|
||||
.put("index.analysis.filter.myCollator.numeric", "true")
|
||||
.build();
|
||||
AnalysisService analysisService = createAnalysisService(settings);
|
||||
|
||||
String nine = "foobar-9";
|
||||
String ten = "foobar-10";
|
||||
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
|
||||
|
||||
TokenStream tsNine = filterFactory.create(new KeywordTokenizer(new StringReader(nine)));
|
||||
TokenStream tsTen = filterFactory.create(new KeywordTokenizer(new StringReader(ten)));
|
||||
assertCollation(tsNine, tsTen, -1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting caseLevel=true to create an additional case level between
|
||||
* secondary and tertiary
|
||||
*/
|
||||
@Test
|
||||
public void testIgnoreAccentsButNotCase() throws IOException {
|
||||
Index index = new Index("test");
|
||||
Settings settings = ImmutableSettings.settingsBuilder()
|
||||
.put("index.analysis.filter.myCollator.type", "icu_collation")
|
||||
.put("index.analysis.filter.myCollator.language", "en")
|
||||
.put("index.analysis.filter.myCollator.strength", "primary")
|
||||
.put("index.analysis.filter.myCollator.caseLevel", "true")
|
||||
.build();
|
||||
AnalysisService analysisService = createAnalysisService(settings);
|
||||
|
||||
String withAccents = "résumé";
|
||||
String withoutAccents = "resume";
|
||||
String withAccentsUpperCase = "Résumé";
|
||||
String withoutAccentsUpperCase = "Resume";
|
||||
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
|
||||
|
||||
TokenStream tsWithAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withAccents)));
|
||||
TokenStream tsWithoutAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents)));
|
||||
assertCollatesToSame(tsWithAccents, tsWithoutAccents);
|
||||
|
||||
TokenStream tsWithAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
|
||||
TokenStream tsWithoutAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
|
||||
assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);
|
||||
|
||||
// now assert that case still matters: resume < Resume
|
||||
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents)));
|
||||
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
|
||||
assertCollation(tsLower, tsUpper, -1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting caseFirst=upper to cause uppercase strings to sort
|
||||
* before lowercase ones.
|
||||
*/
|
||||
@Test
|
||||
public void testUpperCaseFirst() throws IOException {
|
||||
Index index = new Index("test");
|
||||
Settings settings = ImmutableSettings.settingsBuilder()
|
||||
.put("index.analysis.filter.myCollator.type", "icu_collation")
|
||||
.put("index.analysis.filter.myCollator.language", "en")
|
||||
.put("index.analysis.filter.myCollator.strength", "tertiary")
|
||||
.put("index.analysis.filter.myCollator.caseFirst", "upper")
|
||||
.build();
|
||||
AnalysisService analysisService = createAnalysisService(settings);
|
||||
|
||||
String lower = "resume";
|
||||
String upper = "Resume";
|
||||
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
|
||||
|
||||
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lower)));
|
||||
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upper)));
|
||||
assertCollation(tsUpper, tsLower, -1);
|
||||
}
|
||||
|
||||
/*
|
||||
* For german, you might want oe to sort and match with o umlaut.
|
||||
* This is not the default, but you can make a customized ruleset to do this.
|
||||
*
|
||||
* The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
|
||||
* http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
|
||||
*/
|
||||
@Test
|
||||
public void testCustomRules() throws Exception {
|
||||
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
|
||||
String DIN5007_2_tailorings =
|
||||
"& ae , a\u0308 & AE , A\u0308"+
|
||||
"& oe , o\u0308 & OE , O\u0308"+
|
||||
"& ue , u\u0308 & UE , u\u0308";
|
||||
|
||||
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
|
||||
String tailoredRules = tailoredCollator.getRules();
|
||||
|
||||
Settings settings = ImmutableSettings.settingsBuilder()
|
||||
.put("index.analysis.filter.myCollator.type", "icu_collation")
|
||||
.put("index.analysis.filter.myCollator.rules", tailoredRules)
|
||||
.put("index.analysis.filter.myCollator.strength", "primary")
|
||||
.build();
|
||||
AnalysisService analysisService = createAnalysisService(settings);
|
||||
|
||||
String germanUmlaut = "Töne";
|
||||
String germanOE = "Toene";
|
||||
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
|
||||
TokenStream tsUmlaut = filterFactory.create(new KeywordTokenizer(new StringReader(germanUmlaut)));
|
||||
TokenStream tsOE = filterFactory.create(new KeywordTokenizer(new StringReader(germanOE)));
|
||||
assertCollatesToSame(tsUmlaut, tsOE);
|
||||
}
|
||||
|
||||
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
|
||||
assertCollation(stream1, stream2, 0);
|
||||
}
|
||||
|
||||
private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
|
||||
CharTermAttribute term1 = stream1
|
||||
.addAttribute(CharTermAttribute.class);
|
||||
CharTermAttribute term2 = stream2
|
||||
.addAttribute(CharTermAttribute.class);
|
||||
|
||||
stream1.reset();
|
||||
stream2.reset();
|
||||
|
||||
assertThat(stream1.incrementToken(), equalTo(true));
|
||||
assertThat(stream2.incrementToken(), equalTo(true));
|
||||
assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison)));
|
||||
assertThat(stream1.incrementToken(), equalTo(false));
|
||||
assertThat(stream2.incrementToken(), equalTo(false));
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue