upgrade to lucene 5 snapshot (will open issue about collators)

This commit is contained in:
Robert Muir 2014-11-05 16:25:33 -05:00
parent 472c21a138
commit c2c0345837
7 changed files with 18 additions and 410 deletions

View File

@ -41,7 +41,7 @@ Normalizes characters as explained [here](http://userguide.icu-project.org/trans
"index" : {
"analysis" : {
"analyzer" : {
"collation" : {
"normalized" : {
"tokenizer" : "keyword",
"filter" : ["icu_normalizer"]
}
@ -61,7 +61,7 @@ Folding of unicode characters based on `UTR#30`. It registers itself under `icu_
"index" : {
"analysis" : {
"analyzer" : {
"collation" : {
"folded" : {
"tokenizer" : "keyword",
"filter" : ["icu_folding"]
}
@ -101,81 +101,6 @@ The Following example exempts Swedish characters from the folding. Note that the
}
```
ICU Collation
-------------
Uses collation token filter. Allows to either specify the rules for collation
(defined [here](http://www.icu-project.org/userguide/Collate_Customization.html)) using the `rules` parameter
(can point to a location or expressed in the settings, location can be relative to config location), or using the
`language` parameter (further specialized by country and variant). By default it registers under `icu_collation` or
`icuCollation` and uses the default locale.
Here is a sample settings:
```js
{
"index" : {
"analysis" : {
"analyzer" : {
"collation" : {
"tokenizer" : "keyword",
"filter" : ["icu_collation"]
}
}
}
}
}
```
And here is a sample of custom collation:
```js
{
"index" : {
"analysis" : {
"analyzer" : {
"collation" : {
"tokenizer" : "keyword",
"filter" : ["myCollator"]
}
},
"filter" : {
"myCollator" : {
"type" : "icu_collation",
"language" : "en"
}
}
}
}
}
```
Optional options:
* `strength` - The strength property determines the minimum level of difference considered significant during comparison.
The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`.
See [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) documentation for a more detailed
explanation for the specific values.
* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property with
`canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were
normalized. If `no` is set, it is the user's responsibility to ensure that all text is already in the appropriate form
before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between
faster and more complete collation behavior. Since a great many of the world's languages do not require text
normalization, most locales set `no` as the default decomposition mode.
Expert options:
* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`
to be either shifted or non-ignorable. This boils down to ignoring punctuation and whitespace.
* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether case level sorting is required. When
strength is set to `primary` this will ignore accent differences.
* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored
for strength `tertiary`.
* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to numeric representation. For
example the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`.
* `variableTop` - Single character or contraction. Controls what is variable for `alternate`.
* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Distinguishing between Katakana
and Hiragana characters in `quaternary` strength.
ICU Tokenizer
-------------
@ -186,7 +111,7 @@ Breaks text into words according to [UAX #29: Unicode Text Segmentation](http://
"index" : {
"analysis" : {
"analyzer" : {
"collation" : {
"tokenized" : {
"tokenizer" : "icu_tokenizer",
}
}
@ -211,7 +136,7 @@ Here is a sample settings:
"index" : {
"analysis" : {
"analyzer" : {
"collation" : {
"normalized" : {
"tokenizer" : "keyword",
"char_filter" : ["icu_normalizer"]
}

View File

@ -33,8 +33,8 @@
<properties>
<elasticsearch.version>2.0.0-SNAPSHOT</elasticsearch.version>
<lucene.version>4.10.2</lucene.version>
<lucene.maven.version>4.10.2</lucene.maven.version>
<lucene.version>5.0.0</lucene.version>
<lucene.maven.version>5.0.0-snapshot-1636426</lucene.maven.version>
<tests.jvms>1</tests.jvms>
<tests.shuffle>true</tests.shuffle>
<tests.output>onerror</tests.output>
@ -47,6 +47,10 @@
<id>sonatype</id>
<url>http://oss.sonatype.org/content/repositories/releases/</url>
</repository>
<repository>
<id>Lucene snapshots</id>
<url>https://download.elasticsearch.org/lucenesnapshots/maven/</url>
</repository>
</repositories>
<dependencies>

View File

@ -23,7 +23,6 @@ import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.collation.ICUCollationKeyFilter;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
@ -174,6 +173,7 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
@Override
public TokenStream create(TokenStream tokenStream) {
    // ICUCollationKeyFilter was deprecated in Lucene 4 and removed in Lucene 5,
    // so this factory can no longer produce a collation filter.
    // TODO: lucene does sort keys as binary keys since 4.x
    throw new UnsupportedOperationException("i was deprecated in lucene 4, and now i'm gone");
}
}

View File

@ -39,8 +39,8 @@ public class IcuTokenizerFactory extends AbstractTokenizerFactory {
}
@Override
public Tokenizer create(Reader reader) {
return new ICUTokenizer(reader);
public Tokenizer create() {
return new ICUTokenizer();
}
}

View File

@ -19,7 +19,6 @@
package org.elasticsearch.indices.analysis;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.Transliterator;
import org.apache.lucene.analysis.TokenStream;
@ -27,7 +26,6 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
import org.apache.lucene.analysis.icu.ICUTransformFilter;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.collation.ICUCollationKeyFilter;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
@ -36,8 +34,6 @@ import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import java.io.Reader;
/**
* Registers indices level analysis components so, if not explicitly configured, will be shared
* among all indices.
@ -55,8 +51,8 @@ public class IcuIndicesAnalysis extends AbstractComponent {
}
@Override
public Tokenizer create() {
    // Lucene 5 tokenizers no longer take a Reader in the constructor;
    // the analysis chain supplies input via setReader() later.
    return new ICUTokenizer();
}
}));
@ -85,18 +81,6 @@ public class IcuIndicesAnalysis extends AbstractComponent {
}
}));
// NOTE(review): this registration is removed by this commit — ICUCollationKeyFilter
// no longer exists in Lucene 5 (its import is also removed above), so this code
// cannot compile; collation needs to be reimplemented on binary sort keys.
indicesAnalysisService.tokenFilterFactories().put("icu_collation", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
@Override
public String name() {
return "icu_collation";
}
@Override
public TokenStream create(TokenStream tokenStream) {
// Default-locale collator; per-language configuration happened in IcuCollationTokenFilterFactory.
return new ICUCollationKeyFilter(tokenStream, Collator.getInstance());
}
}));
indicesAnalysisService.tokenFilterFactories().put("icu_transform", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
@Override
public String name() {

View File

@ -52,10 +52,8 @@ public class ICUIntegrationTests extends ElasticsearchIntegrationTest {
Settings settings = ImmutableSettings.builder()
.put(super.indexSettings())
.put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
.putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_collator")
.put("index.analysis.filter.my_collator.type", "icu_collation")
.put("index.analysis.filter.my_collator.language", "en")
.put("index.analysis.filter.my_collator.strength", "primary")
.putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "my_folding")
.put("index.analysis.filter.my_folding.type", "icu_folding")
.build();
return settings;

View File

@ -1,303 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
import java.io.IOException;
import java.io.StringReader;
import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService;
import static org.hamcrest.Matchers.equalTo;
// Tests borrowed from Solr's Icu collation key filter factory test.
public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase {

    /*
     * Turkish has some funny casing.
     * This test shows how you can solve this kind of thing easily with collation.
     * Instead of using LowerCaseFilter, use a turkish collator with primary strength.
     * Then things will sort and match correctly.
     */
    @Test
    public void testBasicUsage() throws Exception {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "tr")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        String turkishUpperCase = "I WİLL USE TURKİSH CASING";
        String turkishLowerCase = "ı will use turkish casıng";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase)));
        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase)));
        assertCollatesToSame(tsUpper, tsLower);
    }

    /*
     * Test usage of the decomposition option for unicode normalization.
     * The upper-case input uses a decomposed I + combining dot above (\u0049\u0307),
     * which only matches the precomposed form when decomposition is "canonical".
     */
    @Test
    public void testNormalization() throws IOException {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "tr")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.decomposition", "canonical")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
        String turkishLowerCase = "ı will use turkish casıng";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase)));
        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase)));
        assertCollatesToSame(tsUpper, tsLower);
    }

    /*
     * Test secondary strength, for english case is not significant.
     */
    @Test
    public void testSecondaryStrength() throws IOException {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "secondary")
                .put("index.analysis.filter.myCollator.decomposition", "no")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        String upperCase = "TESTING";
        String lowerCase = "testing";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upperCase)));
        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lowerCase)));
        assertCollatesToSame(tsUpper, tsLower);
    }

    /*
     * Setting alternate=shifted to shift whitespace, punctuation and symbols
     * to quaternary level
     */
    @Test
    public void testIgnorePunctuation() throws IOException {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.alternate", "shifted")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        String withPunctuation = "foo-bar";
        String withoutPunctuation = "foo bar";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        TokenStream tsPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation)));
        TokenStream tsWithoutPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withoutPunctuation)));
        assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
    }

    /*
     * Setting alternate=shifted and variableTop to shift whitespace, but not
     * punctuation or symbols, to quaternary level
     */
    @Test
    public void testIgnoreWhitespace() throws IOException {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.alternate", "shifted")
                .put("index.analysis.filter.myCollator.variableTop", " ")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        String withSpace = "foo bar";
        String withoutSpace = "foobar";
        String withPunctuation = "foo-bar";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        TokenStream tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace)));
        TokenStream tsWithoutSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withoutSpace)));
        assertCollatesToSame(tsWithSpace, tsWithoutSpace);
        // now assert that punctuation still matters: foo-bar < foo bar
        tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace)));
        TokenStream tsWithPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation)));
        assertCollation(tsWithPunctuation, tsWithSpace, -1);
    }

    /*
     * Setting numeric to encode digits with numeric value, so that
     * foobar-9 sorts before foobar-10
     */
    @Test
    public void testNumerics() throws IOException {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.numeric", "true")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        String nine = "foobar-9";
        String ten = "foobar-10";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        TokenStream tsNine = filterFactory.create(new KeywordTokenizer(new StringReader(nine)));
        TokenStream tsTen = filterFactory.create(new KeywordTokenizer(new StringReader(ten)));
        assertCollation(tsNine, tsTen, -1);
    }

    /*
     * Setting caseLevel=true to create an additional case level between
     * secondary and tertiary
     */
    @Test
    public void testIgnoreAccentsButNotCase() throws IOException {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.caseLevel", "true")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        String withAccents = "résumé";
        String withoutAccents = "resume";
        String withAccentsUpperCase = "Résumé";
        String withoutAccentsUpperCase = "Resume";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        TokenStream tsWithAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withAccents)));
        TokenStream tsWithoutAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents)));
        assertCollatesToSame(tsWithAccents, tsWithoutAccents);
        TokenStream tsWithAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
        TokenStream tsWithoutAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
        assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);
        // now assert that case still matters: resume < Resume
        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents)));
        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
        assertCollation(tsLower, tsUpper, -1);
    }

    /*
     * Setting caseFirst=upper to cause uppercase strings to sort
     * before lowercase ones.
     */
    @Test
    public void testUpperCaseFirst() throws IOException {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "tertiary")
                .put("index.analysis.filter.myCollator.caseFirst", "upper")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        String lower = "resume";
        String upper = "Resume";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lower)));
        TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upper)));
        assertCollation(tsUpper, tsLower, -1);
    }

    /*
     * For german, you might want oe to sort and match with o umlaut.
     * This is not the default, but you can make a customized ruleset to do this.
     *
     * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
     * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
     */
    @Test
    public void testCustomRules() throws Exception {
        RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
        // Tailor ae/oe/ue (and their uppercase forms) to match the umlauted vowels.
        // Fixed: the UE rule previously mapped to lowercase u\u0308 instead of U\u0308.
        String DIN5007_2_tailorings =
                "& ae , a\u0308 & AE , A\u0308"+
                "& oe , o\u0308 & OE , O\u0308"+
                "& ue , u\u0308 & UE , U\u0308";
        RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
        String tailoredRules = tailoredCollator.getRules();

        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.rules", tailoredRules)
                .put("index.analysis.filter.myCollator.strength", "primary")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        String germanUmlaut = "Töne";
        String germanOE = "Toene";
        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        TokenStream tsUmlaut = filterFactory.create(new KeywordTokenizer(new StringReader(germanUmlaut)));
        TokenStream tsOE = filterFactory.create(new KeywordTokenizer(new StringReader(germanOE)));
        assertCollatesToSame(tsUmlaut, tsOE);
    }

    /** Asserts that both single-token streams produce equal collation terms. */
    private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
        assertCollation(stream1, stream2, 0);
    }

    /**
     * Consumes exactly one token from each stream and asserts that the terms
     * compare with the given sign (-1, 0, or 1), then asserts both streams are exhausted.
     */
    private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
        CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
        CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
        stream1.reset();
        stream2.reset();
        assertThat(stream1.incrementToken(), equalTo(true));
        assertThat(stream2.incrementToken(), equalTo(true));
        assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison)));
        assertThat(stream1.incrementToken(), equalTo(false));
        assertThat(stream2.incrementToken(), equalTo(false));
    }
}