add back collation (still the way it was working before)

Robert Muir 2014-11-06 03:11:20 -05:00
parent c2c0345837
commit e45308d9e7
8 changed files with 950 additions and 4 deletions


@@ -101,6 +101,81 @@ The following example exempts Swedish characters from the folding. Note that the
}
```
ICU Collation
-------------
Uses the collation token filter. Collation rules can be specified either with the `rules` parameter
(syntax defined [here](http://www.icu-project.org/userguide/Collate_Customization.html)), which can point to a rules file
(the location can be relative to the config location) or contain the rules inline in the settings, or with the
`language` parameter (further specialized by country and variant). By default the filter registers under `icu_collation` or
`icuCollation` and uses the default locale.

Here is a sample of settings:
```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "collation" : {
                    "tokenizer" : "keyword",
                    "filter" : ["icu_collation"]
                }
            }
        }
    }
}
```
And here is a sample of custom collation:
```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "collation" : {
                    "tokenizer" : "keyword",
                    "filter" : ["myCollator"]
                }
            },
            "filter" : {
                "myCollator" : {
                    "type" : "icu_collation",
                    "language" : "en"
                }
            }
        }
    }
}
```
Options:
* `strength` - The strength property determines the minimum level of difference considered significant during comparison.
The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`.
See [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) documentation for a more detailed
explanation for the specific values.
* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this property to
`canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were
normalized. If `no` is set, it is the user's responsibility to ensure that all text is already in the appropriate form
before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between
faster and more complete collation behavior. Since a great many of the world's languages do not require text
normalization, most locales set `no` as the default decomposition mode.
Expert options:
* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`
to be either shifted or non-ignorable, which boils down to ignoring punctuation and whitespace.
* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether case-level sorting is required. When
strength is set to `primary`, this sorts by case while still ignoring accent differences.
* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored
for strength `tertiary`.
* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to their numeric representation. For
example, the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`.
* `variableTop` - Single character or contraction. Controls what is variable for `alternate`.
* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Distinguishes between Katakana
and Hiragana characters in `quaternary` strength.
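
All of these options map onto methods of the underlying ICU4J collator. As a rough illustration (a hypothetical sketch, not the plugin's factory code; the `Collator`/`RuleBasedCollator` method and constant names are ICU4J's, the class is made up):

```java
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;

public class CollationSettingsSketch {
    public static void main(String[] args) {
        // language/country/variant select the base collator
        RuleBasedCollator collator = (RuleBasedCollator) Collator.getInstance(new ULocale("en", "US"));

        collator.setStrength(Collator.PRIMARY);                      // strength: primary
        collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION); // decomposition: canonical
        collator.setAlternateHandlingShifted(true);                  // alternate: shifted
        collator.setVariableTop(" ");                                // variableTop: shift whitespace only
        collator.setCaseLevel(true);                                 // caseLevel: true
        collator.setUpperCaseFirst(true);                            // caseFirst: upper
        collator.setNumericCollation(true);                          // numeric: true
        collator.setHiraganaQuaternary(true);                        // hiraganaQuaternaryMode: true

        // with numeric collation, egg-9 sorts before egg-21
        System.out.println(collator.compare("egg-9", "egg-21") < 0); // true
    }
}
```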
ICU Tokenizer
-------------


@@ -0,0 +1,109 @@
package org.elasticsearch.index.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RawCollationKey;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
/**
* <p>
* Converts each token into its {@link com.ibm.icu.text.CollationKey}, and
* then encodes the CollationKey with {@link IndexableBinaryStringTools}, to
* allow it to be stored as an index term.
* </p>
* <p>
* <strong>WARNING:</strong> Make sure you use exactly the same Collator at
* index and query time -- CollationKeys are only comparable when produced by
* the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are
* independently versioned, so it is safe to search against stored
* CollationKeys if the following are exactly the same (best practice is
* to store this information with the index and check that they remain the
* same at query time):
* </p>
* <ol>
* <li>
* Collator version - see {@link Collator#getVersion()}
* </li>
* <li>
* The collation strength used - see {@link Collator#setStrength(int)}
* </li>
* </ol>
* <p>
* CollationKeys generated by ICU Collators are not compatible with those
* generated by java.text.Collators. Specifically, if you use
* ICUCollationKeyFilter to generate index terms, do not use
* {@code CollationKeyFilter} on the query side, or vice versa.
* </p>
* <p>
* ICUCollationKeyFilter is significantly faster and generates significantly
* shorter keys than CollationKeyFilter. See
* <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
* >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
* @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes
* terms directly as bytes. This filter WAS removed in Lucene 5.0
*/
@Deprecated
public final class ICUCollationKeyFilter extends TokenFilter {
private Collator collator = null;
private RawCollationKey reusableKey = new RawCollationKey();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
*
* @param input Source token stream
* @param collator CollationKey generator
*/
public ICUCollationKeyFilter(TokenStream input, Collator collator) {
super(input);
// clone the collator: see http://userguide.icu-project.org/collation/architecture
try {
this.collator = (Collator) collator.clone();
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e);
}
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
char[] termBuffer = termAtt.buffer();
String termText = new String(termBuffer, 0, termAtt.length());
collator.getRawCollationKey(termText, reusableKey);
int encodedLength = IndexableBinaryStringTools.getEncodedLength(
reusableKey.bytes, 0, reusableKey.size);
if (encodedLength > termBuffer.length) {
termAtt.resizeBuffer(encodedLength);
}
termAtt.setLength(encodedLength);
IndexableBinaryStringTools.encode(reusableKey.bytes, 0, reusableKey.size,
termAtt.buffer(), 0, encodedLength);
return true;
} else {
return false;
}
}
}
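
Below is a minimal usage sketch (mirroring the tests added later in this commit; the class name, locale, and input string are illustrative, not part of the commit). The filter replaces each token's text with its encoded collation key, so the resulting terms compare correctly but are not human-readable.

```java
package org.elasticsearch.index.analysis;

import com.ibm.icu.text.Collator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.StringReader;

public class CollationKeySketch {
    public static void main(String[] args) throws Exception {
        Tokenizer tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader("résumé"));
        TokenStream stream = new ICUCollationKeyFilter(tokenizer, Collator.getInstance(new ULocale("en")));
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // the term now holds the IndexableBinaryStringTools-encoded sort key, not the original text
            System.out.println(term.toString().length());
        }
        stream.end();
        stream.close();
    }
}
```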


@@ -173,7 +173,6 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
     @Override
     public TokenStream create(TokenStream tokenStream) {
-        throw new UnsupportedOperationException("i was deprecated in lucene 4, and now i'm gone");
-        // TODO: lucene does sort keys as binary keys since 4.x
+        return new ICUCollationKeyFilter(tokenStream, collator);
     }
 }


@@ -0,0 +1,241 @@
package org.elasticsearch.index.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadoc
/**
* Provides support for converting byte sequences to Strings and back again.
* The resulting Strings preserve the original byte sequences' sort order.
* <p/>
* The Strings are constructed using a Base 8000h encoding of the original
* binary data - each char of an encoded String represents a 15-bit chunk
* from the byte sequence. Base 8000h was chosen because it allows for all
* lower 15 bits of char to be used without restriction; the surrogate range
* [U+D800-U+DFFF] does not represent valid chars, and would require
* complicated handling to avoid them and allow use of char's high bit.
* <p/>
* Although unset bits are used as padding in the final char, the original
* byte sequence could contain trailing bytes with no set bits (null bytes):
* padding is indistinguishable from valid information. To overcome this
* problem, a char is appended, indicating the number of encoded bytes in the
* final content char.
* <p/>
*
* @lucene.experimental
* @deprecated Implement {@link TermToBytesRefAttribute} and store bytes directly
* instead. This class WAS removed in Lucene 5.0
*/
@Deprecated
public final class IndexableBinaryStringTools {
private static final CodingCase[] CODING_CASES = {
// CodingCase(int initialShift, int finalShift)
new CodingCase( 7, 1 ),
// CodingCase(int initialShift, int middleShift, int finalShift)
new CodingCase(14, 6, 2),
new CodingCase(13, 5, 3),
new CodingCase(12, 4, 4),
new CodingCase(11, 3, 5),
new CodingCase(10, 2, 6),
new CodingCase( 9, 1, 7),
new CodingCase( 8, 0 )
};
// Export only static methods
private IndexableBinaryStringTools() {}
/**
* Returns the number of chars required to encode the given bytes.
*
* @param inputArray byte sequence to be encoded
* @param inputOffset initial offset into inputArray
* @param inputLength number of bytes in inputArray
* @return The number of chars required to encode the number of bytes.
*/
public static int getEncodedLength(byte[] inputArray, int inputOffset,
int inputLength) {
// Use long for intermediaries to protect against overflow
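    // e.g. 12 input bytes = 96 bits -> ceil(96 / 15) = 7 content chars, plus the trailing count char = 8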
return (int)((8L * inputLength + 14L) / 15L) + 1;
}
/**
* Returns the number of bytes required to decode the given char sequence.
*
* @param encoded char sequence to be decoded
* @param offset initial offset
* @param length number of characters
* @return The number of bytes required to decode the given char sequence
*/
public static int getDecodedLength(char[] encoded, int offset, int length) {
final int numChars = length - 1;
if (numChars <= 0) {
return 0;
} else {
// Use long for intermediaries to protect against overflow
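      // e.g. 3 encoded chars (from 3 original bytes): one non-final content char -> (1 * 15 + 7) / 8 = 2 bytes,
      // plus 1 full byte in the final content char (from the trailing count char) = 3 bytes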
final long numFullBytesInFinalChar = encoded[offset + length - 1];
final long numEncodedChars = numChars - 1;
return (int)((numEncodedChars * 15L + 7L) / 8L + numFullBytesInFinalChar);
}
}
/**
* Encodes the input byte sequence into the output char sequence. Before
* calling this method, ensure that the output array has sufficient
* capacity by calling {@link #getEncodedLength(byte[], int, int)}.
*
* @param inputArray byte sequence to be encoded
* @param inputOffset initial offset into inputArray
* @param inputLength number of bytes in inputArray
* @param outputArray char sequence to store encoded result
* @param outputOffset initial offset into outputArray
* @param outputLength length of output, must be getEncodedLength
*/
public static void encode(byte[] inputArray, int inputOffset,
int inputLength, char[] outputArray, int outputOffset, int outputLength) {
assert (outputLength == getEncodedLength(inputArray, inputOffset,
inputLength));
if (inputLength > 0) {
int inputByteNum = inputOffset;
int caseNum = 0;
int outputCharNum = outputOffset;
CodingCase codingCase;
for (; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength; ++outputCharNum) {
codingCase = CODING_CASES[caseNum];
if (2 == codingCase.numBytes) {
outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
+ (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF);
} else { // numBytes is 3
outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
+ ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)
+ (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF);
}
inputByteNum += codingCase.advanceBytes;
if (++caseNum == CODING_CASES.length) {
caseNum = 0;
}
}
// Produce final char (if any) and trailing count chars.
codingCase = CODING_CASES[caseNum];
if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3
outputArray[outputCharNum++] = (char) ((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) & (short) 0x7FFF);
// Add trailing char containing the number of full bytes in final char
outputArray[outputCharNum++] = (char) 1;
} else if (inputByteNum < inputLength) {
outputArray[outputCharNum++] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) & (short) 0x7FFF);
// Add trailing char containing the number of full bytes in final char
outputArray[outputCharNum++] = caseNum == 0 ? (char) 1 : (char) 0;
} else { // No left over bits - last char is completely filled.
// Add trailing char containing the number of full bytes in final char
outputArray[outputCharNum++] = (char) 1;
}
}
}
/**
* Decodes the input char sequence into the output byte sequence. Before
* calling this method, ensure that the output array has sufficient capacity
* by calling {@link #getDecodedLength(char[], int, int)}.
*
* @param inputArray char sequence to be decoded
* @param inputOffset initial offset into inputArray
* @param inputLength number of chars in inputArray
* @param outputArray byte sequence to store encoded result
* @param outputOffset initial offset into outputArray
* @param outputLength length of output, must be
* getDecodedLength(inputArray, inputOffset, inputLength)
*/
public static void decode(char[] inputArray, int inputOffset,
int inputLength, byte[] outputArray, int outputOffset, int outputLength) {
assert (outputLength == getDecodedLength(inputArray, inputOffset,
inputLength));
final int numInputChars = inputLength - 1;
final int numOutputBytes = outputLength;
if (numOutputBytes > 0) {
int caseNum = 0;
int outputByteNum = outputOffset;
int inputCharNum = inputOffset;
short inputChar;
CodingCase codingCase;
for (; inputCharNum < numInputChars - 1; ++inputCharNum) {
codingCase = CODING_CASES[caseNum];
inputChar = (short) inputArray[inputCharNum];
if (2 == codingCase.numBytes) {
if (0 == caseNum) {
outputArray[outputByteNum] = (byte) (inputChar >>> codingCase.initialShift);
} else {
outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
}
outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
} else { // numBytes is 3
outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
}
outputByteNum += codingCase.advanceBytes;
if (++caseNum == CODING_CASES.length) {
caseNum = 0;
}
}
// Handle final char
inputChar = (short) inputArray[inputCharNum];
codingCase = CODING_CASES[caseNum];
if (0 == caseNum) {
outputArray[outputByteNum] = 0;
}
outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
final int bytesLeft = numOutputBytes - outputByteNum;
if (bytesLeft > 1) {
if (2 == codingCase.numBytes) {
outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) >>> codingCase.finalShift);
} else { // numBytes is 3
outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
if (bytesLeft > 2) {
outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
}
}
}
}
}
static class CodingCase {
int numBytes, initialShift, middleShift, finalShift, advanceBytes = 2;
short middleMask, finalMask;
CodingCase(int initialShift, int middleShift, int finalShift) {
this.numBytes = 3;
this.initialShift = initialShift;
this.middleShift = middleShift;
this.finalShift = finalShift;
this.finalMask = (short)((short)0xFF >>> finalShift);
this.middleMask = (short)((short)0xFF << middleShift);
}
CodingCase(int initialShift, int finalShift) {
this.numBytes = 2;
this.initialShift = initialShift;
this.finalShift = finalShift;
this.finalMask = (short)((short)0xFF >>> finalShift);
if (finalShift != 0) {
advanceBytes = 1;
}
}
}
}
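
A minimal round-trip sketch of the intended calling sequence (the byte values and class name are arbitrary): size the output with the `get*Length` helpers, then encode and decode.

```java
package org.elasticsearch.index.analysis;

import java.util.Arrays;

public class IndexableBinaryRoundTripSketch {
    public static void main(String[] args) {
        byte[] original = { 0x23, (byte) 0x98, 0x13 };

        // 3 bytes = 24 bits -> 2 content chars plus 1 trailing count char = 3 chars
        int encodedLen = IndexableBinaryStringTools.getEncodedLength(original, 0, original.length);
        char[] encoded = new char[encodedLen];
        IndexableBinaryStringTools.encode(original, 0, original.length, encoded, 0, encodedLen);

        int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0, encodedLen);
        byte[] decoded = new byte[decodedLen];
        IndexableBinaryStringTools.decode(encoded, 0, encodedLen, decoded, 0, decodedLen);

        System.out.println(Arrays.equals(original, decoded)); // true
    }
}
```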


@@ -19,6 +19,7 @@
 package org.elasticsearch.indices.analysis;
 
+import com.ibm.icu.text.Collator;
 import com.ibm.icu.text.Normalizer2;
 import com.ibm.icu.text.Transliterator;
 import org.apache.lucene.analysis.TokenStream;
@@ -29,6 +30,7 @@ import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.analysis.ICUCollationKeyFilter;
 import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory;
 import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
@@ -81,6 +83,18 @@ public class IcuIndicesAnalysis extends AbstractComponent {
             }
         }));
 
+        indicesAnalysisService.tokenFilterFactories().put("icu_collation", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override
+            public String name() {
+                return "icu_collation";
+            }
+
+            @Override
+            public TokenStream create(TokenStream tokenStream) {
+                return new ICUCollationKeyFilter(tokenStream, Collator.getInstance());
+            }
+        }));
+
         indicesAnalysisService.tokenFilterFactories().put("icu_transform", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
             @Override
             public String name() {


@@ -52,8 +52,10 @@ public class ICUIntegrationTests extends ElasticsearchIntegrationTest {
         Settings settings = ImmutableSettings.builder()
                 .put(super.indexSettings())
                 .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "my_folding")
-                .put("index.analysis.filter.my_folding.type", "icu_folding")
+                .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_collator")
+                .put("index.analysis.filter.my_collator.type", "icu_collation")
+                .put("index.analysis.filter.my_collator.language", "en")
+                .put("index.analysis.filter.my_collator.strength", "primary")
                 .build();
 
         return settings;


@@ -0,0 +1,255 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
import java.io.IOException;
import java.io.StringReader;
import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService;
import static org.hamcrest.Matchers.equalTo;
// Tests borrowed from Solr's Icu collation key filter factory test.
public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase {
/*
* Turkish has some funny casing.
* This test shows how you can solve this kind of thing easily with collation.
* Instead of using LowerCaseFilter, use a turkish collator with primary strength.
* Then things will sort and match correctly.
*/
@Test
public void testBasicUsage() throws Exception {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "tr")
.put("index.analysis.filter.myCollator.strength", "primary")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "I WİLL USE TURKİSH CASING", "ı will use turkish casıng");
}
/*
* Test usage of the decomposition option for unicode normalization.
*/
@Test
public void testNormalization() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "tr")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.decomposition", "canonical")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "I W\u0049\u0307LL USE TURKİSH CASING", "ı will use turkish casıng");
}
/*
* Test secondary strength, for english case is not significant.
*/
@Test
public void testSecondaryStrength() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "secondary")
.put("index.analysis.filter.myCollator.decomposition", "no")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "TESTING", "testing");
}
/*
* Setting alternate=shifted to shift whitespace, punctuation and symbols
* to quaternary level
*/
@Test
public void testIgnorePunctuation() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.alternate", "shifted")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "foo-bar", "foo bar");
}
/*
* Setting alternate=shifted and variableTop to shift whitespace, but not
* punctuation or symbols, to quaternary level
*/
@Test
public void testIgnoreWhitespace() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.alternate", "shifted")
.put("index.analysis.filter.myCollator.variableTop", " ")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "foo bar", "foobar");
// now assert that punctuation still matters: foo-bar < foo bar
assertCollation(filterFactory, "foo-bar", "foo bar", -1);
}
/*
* Setting numeric to encode digits with numeric value, so that
* foobar-9 sorts before foobar-10
*/
@Test
public void testNumerics() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.numeric", "true")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollation(filterFactory, "foobar-9", "foobar-10", -1);
}
/*
* Setting caseLevel=true to create an additional case level between
* secondary and tertiary
*/
@Test
public void testIgnoreAccentsButNotCase() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.caseLevel", "true")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "résumé", "resume");
assertCollatesToSame(filterFactory, "Résumé", "Resume");
// now assert that case still matters: resume < Resume
assertCollation(filterFactory, "resume", "Resume", -1);
}
/*
* Setting caseFirst=upper to cause uppercase strings to sort
* before lowercase ones.
*/
@Test
public void testUpperCaseFirst() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "tertiary")
.put("index.analysis.filter.myCollator.caseFirst", "upper")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollation(filterFactory, "Resume", "resume", -1);
}
/*
* For German, you might want oe to sort and match with o umlaut.
* This is not the default, but you can make a customized ruleset to do this.
*
* The default is DIN 5007-1; this shows how to tailor a collator to get DIN 5007-2 behavior.
* http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
*/
@Test
public void testCustomRules() throws Exception {
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
String DIN5007_2_tailorings =
"& ae , a\u0308 & AE , A\u0308"+
"& oe , o\u0308 & OE , O\u0308"+
"& ue , u\u0308 & UE , u\u0308";
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
String tailoredRules = tailoredCollator.getRules();
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.rules", tailoredRules)
.put("index.analysis.filter.myCollator.strength", "primary")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "Töne", "Toene");
}
private void assertCollatesToSame(TokenFilterFactory factory, String string1, String string2) throws IOException {
assertCollation(factory, string1, string2, 0);
}
private void assertCollation(TokenFilterFactory factory, String string1, String string2, int comparison) throws IOException {
Tokenizer tokenizer = new KeywordTokenizer();
tokenizer.setReader(new StringReader(string1));
TokenStream stream1 = factory.create(tokenizer);
tokenizer = new KeywordTokenizer();
tokenizer.setReader(new StringReader(string2));
TokenStream stream2 = factory.create(tokenizer);
assertCollation(stream1, stream2, comparison);
}
private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
stream1.reset();
stream2.reset();
assertThat(stream1.incrementToken(), equalTo(true));
assertThat(stream2.incrementToken(), equalTo(true));
assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison)));
assertThat(stream1.incrementToken(), equalTo(false));
assertThat(stream2.incrementToken(), equalTo(false));
stream1.end();
stream2.end();
stream1.close();
stream2.close();
}
}


@@ -0,0 +1,251 @@
package org.elasticsearch.index.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.TimeUnits;
import org.elasticsearch.test.ElasticsearchThreadFilter;
import org.elasticsearch.test.junit.listeners.ReproduceInfoPrinter;
import org.junit.BeforeClass;
import com.carrotsearch.randomizedtesting.annotations.Listeners;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope;
import java.util.Locale;
/**
* @deprecated Remove when IndexableBinaryStringTools is removed.
*/
@Deprecated
@Listeners({
ReproduceInfoPrinter.class
})
@ThreadLeakFilters(defaultFilters = true, filters = {ElasticsearchThreadFilter.class})
@ThreadLeakScope(Scope.NONE)
@TimeoutSuite(millis = TimeUnits.HOUR)
@LuceneTestCase.SuppressSysoutChecks(bugUrl = "we log a lot on purpose")
public class TestIndexableBinaryStringTools extends LuceneTestCase {
private static int NUM_RANDOM_TESTS;
private static int MAX_RANDOM_BINARY_LENGTH;
@BeforeClass
public static void beforeClass() throws Exception {
NUM_RANDOM_TESTS = atLeast(200);
MAX_RANDOM_BINARY_LENGTH = atLeast(300);
}
public void testSingleBinaryRoundTrip() {
byte[] binary = new byte[] { (byte) 0x23, (byte) 0x98, (byte) 0x13,
(byte) 0xE4, (byte) 0x76, (byte) 0x41, (byte) 0xB2, (byte) 0xC9,
(byte) 0x7F, (byte) 0x0A, (byte) 0xA6, (byte) 0xD8 };
int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
binary.length);
char encoded[] = new char[encodedLen];
IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0,
encoded.length);
int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
encoded.length);
byte decoded[] = new byte[decodedLen];
IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0,
decoded.length);
assertEquals("Round trip decode/decode returned different results:"
+ System.getProperty("line.separator") + "original: "
+ binaryDump(binary, binary.length)
+ System.getProperty("line.separator") + " encoded: "
+ charArrayDump(encoded, encoded.length)
+ System.getProperty("line.separator") + " decoded: "
+ binaryDump(decoded, decoded.length),
binaryDump(binary, binary.length), binaryDump(decoded, decoded.length));
}
public void testEncodedSortability() {
byte[] originalArray1 = new byte[MAX_RANDOM_BINARY_LENGTH];
char[] originalString1 = new char[MAX_RANDOM_BINARY_LENGTH];
char[] encoded1 = new char[MAX_RANDOM_BINARY_LENGTH * 10];
byte[] original2 = new byte[MAX_RANDOM_BINARY_LENGTH];
char[] originalString2 = new char[MAX_RANDOM_BINARY_LENGTH];
char[] encoded2 = new char[MAX_RANDOM_BINARY_LENGTH * 10];
for (int testNum = 0; testNum < NUM_RANDOM_TESTS; ++testNum) {
int numBytes1 = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1
for (int byteNum = 0; byteNum < numBytes1; ++byteNum) {
int randomInt = random().nextInt(0x100);
originalArray1[byteNum] = (byte) randomInt;
originalString1[byteNum] = (char) randomInt;
}
int numBytes2 = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1
for (int byteNum = 0; byteNum < numBytes2; ++byteNum) {
int randomInt = random().nextInt(0x100);
original2[byteNum] = (byte) randomInt;
originalString2[byteNum] = (char) randomInt;
}
int originalComparison = new String(originalString1, 0, numBytes1)
.compareTo(new String(originalString2, 0, numBytes2));
originalComparison = originalComparison < 0 ? -1
: originalComparison > 0 ? 1 : 0;
int encodedLen1 = IndexableBinaryStringTools.getEncodedLength(
originalArray1, 0, numBytes1);
if (encodedLen1 > encoded1.length)
encoded1 = new char[ArrayUtil.oversize(encodedLen1, RamUsageEstimator.NUM_BYTES_CHAR)];
IndexableBinaryStringTools.encode(originalArray1, 0, numBytes1, encoded1,
0, encodedLen1);
int encodedLen2 = IndexableBinaryStringTools.getEncodedLength(original2,
0, numBytes2);
if (encodedLen2 > encoded2.length)
encoded2 = new char[ArrayUtil.oversize(encodedLen2, RamUsageEstimator.NUM_BYTES_CHAR)];
IndexableBinaryStringTools.encode(original2, 0, numBytes2, encoded2, 0,
encodedLen2);
int encodedComparison = new String(encoded1, 0, encodedLen1)
.compareTo(new String(encoded2, 0, encodedLen2));
encodedComparison = encodedComparison < 0 ? -1
: encodedComparison > 0 ? 1 : 0;
assertEquals("Test #" + (testNum + 1)
+ ": Original bytes and encoded chars compare differently:"
+ System.getProperty("line.separator") + " binary 1: "
+ binaryDump(originalArray1, numBytes1)
+ System.getProperty("line.separator") + " binary 2: "
+ binaryDump(original2, numBytes2)
+ System.getProperty("line.separator") + "encoded 1: "
+ charArrayDump(encoded1, encodedLen1)
+ System.getProperty("line.separator") + "encoded 2: "
+ charArrayDump(encoded2, encodedLen2)
+ System.getProperty("line.separator"), originalComparison,
encodedComparison);
}
}
public void testEmptyInput() {
byte[] binary = new byte[0];
int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
binary.length);
char[] encoded = new char[encodedLen];
IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0,
encoded.length);
int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
encoded.length);
byte[] decoded = new byte[decodedLen];
IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0,
decoded.length);
assertEquals("decoded empty input was not empty", decoded.length, 0);
}
public void testAllNullInput() {
byte[] binary = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
binary.length);
char encoded[] = new char[encodedLen];
IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0,
encoded.length);
int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
encoded.length);
byte[] decoded = new byte[decodedLen];
IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0,
decoded.length);
assertEquals("Round trip decode/decode returned different results:"
+ System.getProperty("line.separator") + " original: "
+ binaryDump(binary, binary.length)
+ System.getProperty("line.separator") + "decodedBuf: "
+ binaryDump(decoded, decoded.length),
binaryDump(binary, binary.length), binaryDump(decoded, decoded.length));
}
public void testRandomBinaryRoundTrip() {
byte[] binary = new byte[MAX_RANDOM_BINARY_LENGTH];
char[] encoded = new char[MAX_RANDOM_BINARY_LENGTH * 10];
byte[] decoded = new byte[MAX_RANDOM_BINARY_LENGTH];
for (int testNum = 0; testNum < NUM_RANDOM_TESTS; ++testNum) {
int numBytes = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1
for (int byteNum = 0; byteNum < numBytes; ++byteNum) {
binary[byteNum] = (byte) random().nextInt(0x100);
}
int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
numBytes);
if (encoded.length < encodedLen)
encoded = new char[ArrayUtil.oversize(encodedLen, RamUsageEstimator.NUM_BYTES_CHAR)];
IndexableBinaryStringTools.encode(binary, 0, numBytes, encoded, 0,
encodedLen);
int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
encodedLen);
IndexableBinaryStringTools.decode(encoded, 0, encodedLen, decoded, 0,
decodedLen);
assertEquals("Test #" + (testNum + 1)
+ ": Round trip decode/decode returned different results:"
+ System.getProperty("line.separator") + " original: "
+ binaryDump(binary, numBytes) + System.getProperty("line.separator")
+ "encodedBuf: " + charArrayDump(encoded, encodedLen)
+ System.getProperty("line.separator") + "decodedBuf: "
+ binaryDump(decoded, decodedLen), binaryDump(binary, numBytes),
binaryDump(decoded, decodedLen));
}
}
public String binaryDump(byte[] binary, int numBytes) {
StringBuilder buf = new StringBuilder();
for (int byteNum = 0 ; byteNum < numBytes ; ++byteNum) {
String hex = Integer.toHexString(binary[byteNum] & 0xFF);
if (hex.length() == 1) {
buf.append('0');
}
buf.append(hex.toUpperCase(Locale.ROOT));
if (byteNum < numBytes - 1) {
buf.append(' ');
}
}
return buf.toString();
}
public String charArrayDump(char[] charArray, int numBytes) {
StringBuilder buf = new StringBuilder();
for (int charNum = 0 ; charNum < numBytes ; ++charNum) {
String hex = Integer.toHexString(charArray[charNum]);
for (int digit = 0 ; digit < 4 - hex.length() ; ++digit) {
buf.append('0');
}
buf.append(hex.toUpperCase(Locale.ROOT));
if (charNum < numBytes - 1) {
buf.append(' ');
}
}
return buf.toString();
}
}