add back collation (still the way it was working before)

parent c2c0345837
commit e45308d9e7
@@ -101,6 +101,81 @@ The following example exempts Swedish characters from the folding. Note that the
}
```

ICU Collation
-------------

Uses the collation token filter. Collation rules can be specified either with
the `rules` parameter (the rule syntax is defined
[here](http://www.icu-project.org/userguide/Collate_Customization.html)), which
can point to a rules file or contain the rules inline in the settings (a file
location can be relative to the config location), or with the `language`
parameter (further specialized by country and variant). By default the filter
registers under `icu_collation` or `icuCollation` and uses the default locale.

Here are sample settings:

```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "collation" : {
                    "tokenizer" : "keyword",
                    "filter" : ["icu_collation"]
                }
            }
        }
    }
}
```
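
A field analyzed this way emits a single token whose characters encode the
collation key, so sorting on that field yields locale-aware ordering. A minimal
search sketch (the field name `name.sort` is illustrative, not part of the
plugin):

```js
{
    "query" : { "match_all" : {} },
    "sort" : [ { "name.sort" : "asc" } ]
}
```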

And here is a sample of a custom collation:

```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "collation" : {
                    "tokenizer" : "keyword",
                    "filter" : ["myCollator"]
                }
            },
            "filter" : {
                "myCollator" : {
                    "type" : "icu_collation",
                    "language" : "en"
                }
            }
        }
    }
}
```

Options:

* `strength` - Determines the minimum level of difference considered significant
  during comparison. The default strength for the Collator is `tertiary`, unless
  specified otherwise by the locale used to create the Collator. Possible
  values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`. See
  the [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html)
  documentation for a more detailed explanation of the specific values.
* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`.
  Setting this property to `canonical` allows the Collator to handle
  un-normalized text properly, producing the same results as if the text were
  normalized. If `no` is set, it is the user's responsibility to ensure that all
  text is already in the appropriate form before a comparison or before getting
  a CollationKey. Adjusting the decomposition mode allows the user to select
  between faster and more complete collation behavior. Since a great many of the
  world's languages do not require text normalization, most locales set `no` as
  the default decomposition mode.

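For example, a collator that ignores case and accent differences and copes with
un-normalized input can combine both options. A sketch (the `caseInsensitive`
filter name is illustrative):

```js
{
    "index" : {
        "analysis" : {
            "filter" : {
                "caseInsensitive" : {
                    "type" : "icu_collation",
                    "language" : "en",
                    "strength" : "primary",
                    "decomposition" : "canonical"
                }
            }
        }
    }
}
```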

Expert options:

* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the
  alternate handling for strength `quaternary` to be either shifted or
  non-ignorable, which boils down to ignoring punctuation and whitespace.
* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether
  case-level sorting is required. When strength is set to `primary` this will
  ignore accent differences.
* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which
  case is sorted first when case is not ignored for strength `tertiary`.
* `numeric` - Possible values: `true` or `false`. Whether digits are sorted
  according to their numeric representation. For example, the value `egg-9` is
  sorted before the value `egg-21`. Defaults to `false`.
* `variableTop` - Single character or contraction. Controls which characters
  are treated as variable for the `alternate` handling.
* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to
  `false`. Whether to distinguish between Katakana and Hiragana characters at
  `quaternary` strength.
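
As a sketch of the expert options (the `myCollator` name is illustrative), the
following collator shifts punctuation and whitespace to the quaternary level
and sorts embedded digits numerically, so `egg-9` sorts before `egg-21`:

```js
{
    "index" : {
        "analysis" : {
            "filter" : {
                "myCollator" : {
                    "type" : "icu_collation",
                    "language" : "en",
                    "strength" : "primary",
                    "alternate" : "shifted",
                    "numeric" : "true"
                }
            }
        }
    }
}
```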

ICU Tokenizer
-------------

@@ -0,0 +1,109 @@
package org.elasticsearch.index.analysis;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RawCollationKey;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;

/**
 * <p>
 * Converts each token into its {@link com.ibm.icu.text.CollationKey}, and
 * then encodes the CollationKey with {@link IndexableBinaryStringTools}, to
 * allow it to be stored as an index term.
 * </p>
 * <p>
 * <strong>WARNING:</strong> Make sure you use exactly the same Collator at
 * index and query time -- CollationKeys are only comparable when produced by
 * the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are
 * independently versioned, so it is safe to search against stored
 * CollationKeys if the following are exactly the same (best practice is
 * to store this information with the index and check that they remain the
 * same at query time):
 * </p>
 * <ol>
 *   <li>
 *     Collator version - see {@link Collator#getVersion()}
 *   </li>
 *   <li>
 *     The collation strength used - see {@link Collator#setStrength(int)}
 *   </li>
 * </ol>
 * <p>
 * CollationKeys generated by ICU Collators are not compatible with those
 * generated by java.text.Collators. Specifically, if you use
 * ICUCollationKeyFilter to generate index terms, do not use
 * {@code CollationKeyFilter} on the query side, or vice versa.
 * </p>
 * <p>
 * ICUCollationKeyFilter is significantly faster and generates significantly
 * shorter keys than CollationKeyFilter. See
 * <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
 * >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
 * generation timing and key length comparisons between ICU4J and
 * java.text.Collator over several languages.
 * </p>
 *
 * @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes
 *             terms directly as bytes. This filter WAS removed in Lucene 5.0
 */
@Deprecated
public final class ICUCollationKeyFilter extends TokenFilter {
    private Collator collator = null;
    private RawCollationKey reusableKey = new RawCollationKey();
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    /**
     * @param input    Source token stream
     * @param collator CollationKey generator
     */
    public ICUCollationKeyFilter(TokenStream input, Collator collator) {
        super(input);
        // clone the collator: see http://userguide.icu-project.org/collation/architecture
        try {
            this.collator = (Collator) collator.clone();
        } catch (CloneNotSupportedException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (input.incrementToken()) {
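            // Replace the term text in place with an encoded form of its ICU
            // collation key: compute the raw sort-key bytes for the term, grow
            // the term buffer if the char-encoded key needs more room, then
            // encode the key bytes into index-safe chars.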
            char[] termBuffer = termAtt.buffer();
            String termText = new String(termBuffer, 0, termAtt.length());
            collator.getRawCollationKey(termText, reusableKey);
            int encodedLength = IndexableBinaryStringTools.getEncodedLength(
                    reusableKey.bytes, 0, reusableKey.size);
            if (encodedLength > termBuffer.length) {
                termAtt.resizeBuffer(encodedLength);
            }
            termAtt.setLength(encodedLength);
            IndexableBinaryStringTools.encode(reusableKey.bytes, 0, reusableKey.size,
                    termAtt.buffer(), 0, encodedLength);
            return true;
        } else {
            return false;
        }
    }
}

@@ -173,7 +173,6 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {

     @Override
     public TokenStream create(TokenStream tokenStream) {
-        throw new UnsupportedOperationException("i was deprecated in lucene 4, and now i'm gone");
-        // TODO: lucene does sort keys as binary keys since 4.x
+        return new ICUCollationKeyFilter(tokenStream, collator);
     }
 }

@@ -0,0 +1,241 @@
package org.elasticsearch.index.analysis;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadoc

/**
 * Provides support for converting byte sequences to Strings and back again.
 * The resulting Strings preserve the original byte sequences' sort order.
 * <p/>
 * The Strings are constructed using a Base 8000h encoding of the original
 * binary data - each char of an encoded String represents a 15-bit chunk
 * from the byte sequence. Base 8000h was chosen because it allows for all
 * lower 15 bits of char to be used without restriction; the surrogate range
 * [U+D800-U+DFFF] does not represent valid chars, and would require
 * complicated handling to avoid them and allow use of char's high bit.
 * <p/>
 * Although unset bits are used as padding in the final char, the original
 * byte sequence could contain trailing bytes with no set bits (null bytes):
 * padding is indistinguishable from valid information. To overcome this
 * problem, a char is appended, indicating the number of encoded bytes in the
 * final content char.
 * <p/>
 *
 * @lucene.experimental
 * @deprecated Implement {@link TermToBytesRefAttribute} and store bytes directly
 *             instead. This class WAS removed in Lucene 5.0
 */
@Deprecated
public final class IndexableBinaryStringTools {

    private static final CodingCase[] CODING_CASES = {
        // CodingCase(int initialShift, int finalShift)
        new CodingCase(7, 1),
        // CodingCase(int initialShift, int middleShift, int finalShift)
        new CodingCase(14, 6, 2),
        new CodingCase(13, 5, 3),
        new CodingCase(12, 4, 4),
        new CodingCase(11, 3, 5),
        new CodingCase(10, 2, 6),
        new CodingCase(9, 1, 7),
        new CodingCase(8, 0)
    };

    // Export only static methods
    private IndexableBinaryStringTools() {}

    /**
     * Returns the number of chars required to encode the given bytes.
     *
     * @param inputArray  byte sequence to be encoded
     * @param inputOffset initial offset into inputArray
     * @param inputLength number of bytes in inputArray
     * @return The number of chars required to encode the number of bytes.
     */
    public static int getEncodedLength(byte[] inputArray, int inputOffset,
            int inputLength) {
        // Use long for intermediaries to protect against overflow
        return (int) ((8L * inputLength + 14L) / 15L) + 1;
    }
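
    // Worked example (a sketch, not part of the original source): 12 input
    // bytes carry 8 * 12 = 96 bits. Packed 15 bits per char, that needs
    // ceil(96 / 15) = (96 + 14) / 15 = 7 content chars, plus 1 trailing count
    // char, so getEncodedLength returns 8 for a 12-byte input.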

    /**
     * Returns the number of bytes required to decode the given char sequence.
     *
     * @param encoded char sequence to be decoded
     * @param offset  initial offset
     * @param length  number of characters
     * @return The number of bytes required to decode the given char sequence
     */
    public static int getDecodedLength(char[] encoded, int offset, int length) {
        final int numChars = length - 1;
        if (numChars <= 0) {
            return 0;
        } else {
            // Use long for intermediaries to protect against overflow
            final long numFullBytesInFinalChar = encoded[offset + length - 1];
            final long numEncodedChars = numChars - 1;
            return (int) ((numEncodedChars * 15L + 7L) / 8L + numFullBytesInFinalChar);
        }
    }

    /**
     * Encodes the input byte sequence into the output char sequence. Before
     * calling this method, ensure that the output array has sufficient
     * capacity by calling {@link #getEncodedLength(byte[], int, int)}.
     *
     * @param inputArray   byte sequence to be encoded
     * @param inputOffset  initial offset into inputArray
     * @param inputLength  number of bytes in inputArray
     * @param outputArray  char sequence to store encoded result
     * @param outputOffset initial offset into outputArray
     * @param outputLength length of output, must be getEncodedLength
     */
    public static void encode(byte[] inputArray, int inputOffset,
            int inputLength, char[] outputArray, int outputOffset, int outputLength) {
        assert (outputLength == getEncodedLength(inputArray, inputOffset,
                inputLength));
        if (inputLength > 0) {
            int inputByteNum = inputOffset;
            int caseNum = 0;
            int outputCharNum = outputOffset;
            CodingCase codingCase;
            for (; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength; ++outputCharNum) {
                codingCase = CODING_CASES[caseNum];
                if (2 == codingCase.numBytes) {
                    outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
                            + (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF);
                } else { // numBytes is 3
                    outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
                            + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)
                            + (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF);
                }
                inputByteNum += codingCase.advanceBytes;
                if (++caseNum == CODING_CASES.length) {
                    caseNum = 0;
                }
            }
            // Produce final char (if any) and trailing count chars.
            codingCase = CODING_CASES[caseNum];

            if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3
                outputArray[outputCharNum++] = (char) ((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) & (short) 0x7FFF);
                // Add trailing char containing the number of full bytes in final char
                outputArray[outputCharNum++] = (char) 1;
            } else if (inputByteNum < inputLength) {
                outputArray[outputCharNum++] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) & (short) 0x7FFF);
                // Add trailing char containing the number of full bytes in final char
                outputArray[outputCharNum++] = caseNum == 0 ? (char) 1 : (char) 0;
            } else { // No left over bits - last char is completely filled.
                // Add trailing char containing the number of full bytes in final char
                outputArray[outputCharNum++] = (char) 1;
            }
        }
    }

    /**
     * Decodes the input char sequence into the output byte sequence. Before
     * calling this method, ensure that the output array has sufficient capacity
     * by calling {@link #getDecodedLength(char[], int, int)}.
     *
     * @param inputArray   char sequence to be decoded
     * @param inputOffset  initial offset into inputArray
     * @param inputLength  number of chars in inputArray
     * @param outputArray  byte sequence to store encoded result
     * @param outputOffset initial offset into outputArray
     * @param outputLength length of output, must be
     *                     getDecodedLength(inputArray, inputOffset, inputLength)
     */
    public static void decode(char[] inputArray, int inputOffset,
            int inputLength, byte[] outputArray, int outputOffset, int outputLength) {
        assert (outputLength == getDecodedLength(inputArray, inputOffset,
                inputLength));
        final int numInputChars = inputLength - 1;
        final int numOutputBytes = outputLength;

        if (numOutputBytes > 0) {
            int caseNum = 0;
            int outputByteNum = outputOffset;
            int inputCharNum = inputOffset;
            short inputChar;
            CodingCase codingCase;
            for (; inputCharNum < numInputChars - 1; ++inputCharNum) {
                codingCase = CODING_CASES[caseNum];
                inputChar = (short) inputArray[inputCharNum];
                if (2 == codingCase.numBytes) {
                    if (0 == caseNum) {
                        outputArray[outputByteNum] = (byte) (inputChar >>> codingCase.initialShift);
                    } else {
                        outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
                    }
                    outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
                } else { // numBytes is 3
                    outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
                    outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
                    outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
                }
                outputByteNum += codingCase.advanceBytes;
                if (++caseNum == CODING_CASES.length) {
                    caseNum = 0;
                }
            }
            // Handle final char
            inputChar = (short) inputArray[inputCharNum];
            codingCase = CODING_CASES[caseNum];
            if (0 == caseNum) {
                outputArray[outputByteNum] = 0;
            }
            outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
            final int bytesLeft = numOutputBytes - outputByteNum;
            if (bytesLeft > 1) {
                if (2 == codingCase.numBytes) {
                    outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) >>> codingCase.finalShift);
                } else { // numBytes is 3
                    outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
                    if (bytesLeft > 2) {
                        outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
                    }
                }
            }
        }
    }

    static class CodingCase {
        int numBytes, initialShift, middleShift, finalShift, advanceBytes = 2;
        short middleMask, finalMask;

        CodingCase(int initialShift, int middleShift, int finalShift) {
            this.numBytes = 3;
            this.initialShift = initialShift;
            this.middleShift = middleShift;
            this.finalShift = finalShift;
            this.finalMask = (short) ((short) 0xFF >>> finalShift);
            this.middleMask = (short) ((short) 0xFF << middleShift);
        }

        CodingCase(int initialShift, int finalShift) {
            this.numBytes = 2;
            this.initialShift = initialShift;
            this.finalShift = finalShift;
            this.finalMask = (short) ((short) 0xFF >>> finalShift);
            if (finalShift != 0) {
                advanceBytes = 1;
            }
        }
    }
}

@@ -19,6 +19,7 @@

 package org.elasticsearch.indices.analysis;

+import com.ibm.icu.text.Collator;
 import com.ibm.icu.text.Normalizer2;
 import com.ibm.icu.text.Transliterator;
 import org.apache.lucene.analysis.TokenStream;

@@ -29,6 +30,7 @@ import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.analysis.ICUCollationKeyFilter;
 import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory;
 import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;

@@ -81,6 +83,18 @@ public class IcuIndicesAnalysis extends AbstractComponent {
             }
         }));

+        indicesAnalysisService.tokenFilterFactories().put("icu_collation", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override
+            public String name() {
+                return "icu_collation";
+            }
+
+            @Override
+            public TokenStream create(TokenStream tokenStream) {
+                return new ICUCollationKeyFilter(tokenStream, Collator.getInstance());
+            }
+        }));
+
         indicesAnalysisService.tokenFilterFactories().put("icu_transform", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
             @Override
             public String name() {

@@ -52,8 +52,10 @@ public class ICUIntegrationTests extends ElasticsearchIntegrationTest {
         Settings settings = ImmutableSettings.builder()
                 .put(super.indexSettings())
                 .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "my_folding")
-                .put("index.analysis.filter.my_folding.type", "icu_folding")
+                .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_collator")
+                .put("index.analysis.filter.my_collator.type", "icu_collation")
+                .put("index.analysis.filter.my_collator.language", "en")
+                .put("index.analysis.filter.my_collator.strength", "primary")
                 .build();

         return settings;

@@ -0,0 +1,255 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;

import java.io.IOException;
import java.io.StringReader;

import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService;
import static org.hamcrest.Matchers.equalTo;

// Tests borrowed from Solr's Icu collation key filter factory test.
public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase {

    /*
     * Turkish has some funny casing.
     * This test shows how you can solve this kind of thing easily with collation.
     * Instead of using LowerCaseFilter, use a Turkish collator with primary strength.
     * Then things will sort and match correctly.
     */
    @Test
    public void testBasicUsage() throws Exception {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "tr")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollatesToSame(filterFactory, "I WİLL USE TURKİSH CASING", "ı will use turkish casıng");
    }

    /*
     * Test usage of the decomposition option for unicode normalization.
     */
    @Test
    public void testNormalization() throws IOException {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "tr")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.decomposition", "canonical")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollatesToSame(filterFactory, "I W\u0049\u0307LL USE TURKİSH CASING", "ı will use turkish casıng");
    }

    /*
     * Test secondary strength; for English, case is not significant.
     */
    @Test
    public void testSecondaryStrength() throws IOException {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "secondary")
                .put("index.analysis.filter.myCollator.decomposition", "no")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollatesToSame(filterFactory, "TESTING", "testing");
    }

    /*
     * Setting alternate=shifted to shift whitespace, punctuation and symbols
     * to quaternary level
     */
    @Test
    public void testIgnorePunctuation() throws IOException {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.alternate", "shifted")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollatesToSame(filterFactory, "foo-bar", "foo bar");
    }

    /*
     * Setting alternate=shifted and variableTop to shift whitespace, but not
     * punctuation or symbols, to quaternary level
     */
    @Test
    public void testIgnoreWhitespace() throws IOException {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.alternate", "shifted")
                .put("index.analysis.filter.myCollator.variableTop", " ")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollatesToSame(filterFactory, "foo bar", "foobar");
        // now assert that punctuation still matters: foo-bar < foo bar
        assertCollation(filterFactory, "foo-bar", "foo bar", -1);
    }

    /*
     * Setting numeric to encode digits with numeric value, so that
     * foobar-9 sorts before foobar-10
     */
    @Test
    public void testNumerics() throws IOException {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.numeric", "true")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollation(filterFactory, "foobar-9", "foobar-10", -1);
    }

    /*
     * Setting caseLevel=true to create an additional case level between
     * secondary and tertiary
     */
    @Test
    public void testIgnoreAccentsButNotCase() throws IOException {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.caseLevel", "true")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollatesToSame(filterFactory, "résumé", "resume");
        assertCollatesToSame(filterFactory, "Résumé", "Resume");
        // now assert that case still matters: resume < Resume
        assertCollation(filterFactory, "resume", "Resume", -1);
    }

    /*
     * Setting caseFirst=upper to cause uppercase strings to sort
     * before lowercase ones.
     */
    @Test
    public void testUpperCaseFirst() throws IOException {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "tertiary")
                .put("index.analysis.filter.myCollator.caseFirst", "upper")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollation(filterFactory, "Resume", "resume", -1);
    }

    /*
     * For German, you might want oe to sort and match with o umlaut.
     * This is not the default, but you can make a customized ruleset to do this.
     *
     * The default is DIN 5007-1; this shows how to tailor a collator to get DIN 5007-2 behavior.
     * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
     */
    @Test
    public void testCustomRules() throws Exception {
        RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
        String DIN5007_2_tailorings =
                "& ae , a\u0308 & AE , A\u0308" +
                "& oe , o\u0308 & OE , O\u0308" +
                "& ue , u\u0308 & UE , u\u0308";

        RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
        String tailoredRules = tailoredCollator.getRules();

        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.rules", tailoredRules)
                .put("index.analysis.filter.myCollator.strength", "primary")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollatesToSame(filterFactory, "Töne", "Toene");
    }

    private void assertCollatesToSame(TokenFilterFactory factory, String string1, String string2) throws IOException {
        assertCollation(factory, string1, string2, 0);
    }

    private void assertCollation(TokenFilterFactory factory, String string1, String string2, int comparison) throws IOException {
        Tokenizer tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader(string1));
        TokenStream stream1 = factory.create(tokenizer);

        tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader(string2));
        TokenStream stream2 = factory.create(tokenizer);

        assertCollation(stream1, stream2, comparison);
    }

    private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
        CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
        CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);

        stream1.reset();
        stream2.reset();

        assertThat(stream1.incrementToken(), equalTo(true));
        assertThat(stream2.incrementToken(), equalTo(true));
        assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison)));
        assertThat(stream1.incrementToken(), equalTo(false));
        assertThat(stream2.incrementToken(), equalTo(false));

        stream1.end();
        stream2.end();

        stream1.close();
        stream2.close();
    }
}

@@ -0,0 +1,251 @@
package org.elasticsearch.index.analysis;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.TimeUnits;
import org.elasticsearch.test.ElasticsearchThreadFilter;
import org.elasticsearch.test.junit.listeners.ReproduceInfoPrinter;
import org.junit.BeforeClass;

import com.carrotsearch.randomizedtesting.annotations.Listeners;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope;

import java.util.Locale;

/**
 * @deprecated Remove when IndexableBinaryStringTools is removed.
 */
@Deprecated
@Listeners({
        ReproduceInfoPrinter.class
})
@ThreadLeakFilters(defaultFilters = true, filters = {ElasticsearchThreadFilter.class})
@ThreadLeakScope(Scope.NONE)
@TimeoutSuite(millis = TimeUnits.HOUR)
@LuceneTestCase.SuppressSysoutChecks(bugUrl = "we log a lot on purpose")
public class TestIndexableBinaryStringTools extends LuceneTestCase {
    private static int NUM_RANDOM_TESTS;
    private static int MAX_RANDOM_BINARY_LENGTH;

    @BeforeClass
    public static void beforeClass() throws Exception {
        NUM_RANDOM_TESTS = atLeast(200);
        MAX_RANDOM_BINARY_LENGTH = atLeast(300);
    }

    public void testSingleBinaryRoundTrip() {
        byte[] binary = new byte[] { (byte) 0x23, (byte) 0x98, (byte) 0x13,
                (byte) 0xE4, (byte) 0x76, (byte) 0x41, (byte) 0xB2, (byte) 0xC9,
                (byte) 0x7F, (byte) 0x0A, (byte) 0xA6, (byte) 0xD8 };

        int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
                binary.length);
        char[] encoded = new char[encodedLen];
        IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0,
                encoded.length);

        int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
                encoded.length);
        byte[] decoded = new byte[decodedLen];
        IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0,
                decoded.length);

        assertEquals("Round trip encode/decode returned different results:"
                + System.getProperty("line.separator") + "original: "
                + binaryDump(binary, binary.length)
                + System.getProperty("line.separator") + " encoded: "
                + charArrayDump(encoded, encoded.length)
                + System.getProperty("line.separator") + " decoded: "
                + binaryDump(decoded, decoded.length),
                binaryDump(binary, binary.length), binaryDump(decoded, decoded.length));
    }

    public void testEncodedSortability() {
        byte[] originalArray1 = new byte[MAX_RANDOM_BINARY_LENGTH];
        char[] originalString1 = new char[MAX_RANDOM_BINARY_LENGTH];
        char[] encoded1 = new char[MAX_RANDOM_BINARY_LENGTH * 10];
        byte[] original2 = new byte[MAX_RANDOM_BINARY_LENGTH];
        char[] originalString2 = new char[MAX_RANDOM_BINARY_LENGTH];
        char[] encoded2 = new char[MAX_RANDOM_BINARY_LENGTH * 10];

        for (int testNum = 0; testNum < NUM_RANDOM_TESTS; ++testNum) {
            int numBytes1 = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1

            for (int byteNum = 0; byteNum < numBytes1; ++byteNum) {
                int randomInt = random().nextInt(0x100);
                originalArray1[byteNum] = (byte) randomInt;
                originalString1[byteNum] = (char) randomInt;
            }

            int numBytes2 = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1

            for (int byteNum = 0; byteNum < numBytes2; ++byteNum) {
                int randomInt = random().nextInt(0x100);
                original2[byteNum] = (byte) randomInt;
                originalString2[byteNum] = (char) randomInt;
            }
            int originalComparison = new String(originalString1, 0, numBytes1)
                    .compareTo(new String(originalString2, 0, numBytes2));
            originalComparison = originalComparison < 0 ? -1
                    : originalComparison > 0 ? 1 : 0;

            int encodedLen1 = IndexableBinaryStringTools.getEncodedLength(
                    originalArray1, 0, numBytes1);
            if (encodedLen1 > encoded1.length)
                encoded1 = new char[ArrayUtil.oversize(encodedLen1, RamUsageEstimator.NUM_BYTES_CHAR)];
            IndexableBinaryStringTools.encode(originalArray1, 0, numBytes1, encoded1,
                    0, encodedLen1);

            int encodedLen2 = IndexableBinaryStringTools.getEncodedLength(original2,
                    0, numBytes2);
            if (encodedLen2 > encoded2.length)
                encoded2 = new char[ArrayUtil.oversize(encodedLen2, RamUsageEstimator.NUM_BYTES_CHAR)];
            IndexableBinaryStringTools.encode(original2, 0, numBytes2, encoded2, 0,
                    encodedLen2);

            int encodedComparison = new String(encoded1, 0, encodedLen1)
                    .compareTo(new String(encoded2, 0, encodedLen2));
            encodedComparison = encodedComparison < 0 ? -1
                    : encodedComparison > 0 ? 1 : 0;

            assertEquals("Test #" + (testNum + 1)
                    + ": Original bytes and encoded chars compare differently:"
                    + System.getProperty("line.separator") + " binary 1: "
                    + binaryDump(originalArray1, numBytes1)
                    + System.getProperty("line.separator") + " binary 2: "
                    + binaryDump(original2, numBytes2)
                    + System.getProperty("line.separator") + "encoded 1: "
                    + charArrayDump(encoded1, encodedLen1)
                    + System.getProperty("line.separator") + "encoded 2: "
                    + charArrayDump(encoded2, encodedLen2)
                    + System.getProperty("line.separator"), originalComparison,
                    encodedComparison);
        }
    }

    public void testEmptyInput() {
        byte[] binary = new byte[0];

        int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
                binary.length);
        char[] encoded = new char[encodedLen];
        IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0,
                encoded.length);

        int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
                encoded.length);
        byte[] decoded = new byte[decodedLen];
        IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0,
                decoded.length);

        assertEquals("decoded empty input was not empty", 0, decoded.length);
    }

    public void testAllNullInput() {
        byte[] binary = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0 };

        int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
                binary.length);
        char[] encoded = new char[encodedLen];
        IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0,
                encoded.length);

        int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
                encoded.length);
        byte[] decoded = new byte[decodedLen];
        IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0,
                decoded.length);

        assertEquals("Round trip encode/decode returned different results:"
                + System.getProperty("line.separator") + "  original: "
                + binaryDump(binary, binary.length)
                + System.getProperty("line.separator") + "decodedBuf: "
                + binaryDump(decoded, decoded.length),
                binaryDump(binary, binary.length), binaryDump(decoded, decoded.length));
    }

    public void testRandomBinaryRoundTrip() {
        byte[] binary = new byte[MAX_RANDOM_BINARY_LENGTH];
        char[] encoded = new char[MAX_RANDOM_BINARY_LENGTH * 10];
        byte[] decoded = new byte[MAX_RANDOM_BINARY_LENGTH];
        for (int testNum = 0; testNum < NUM_RANDOM_TESTS; ++testNum) {
            int numBytes = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1

            for (int byteNum = 0; byteNum < numBytes; ++byteNum) {
                binary[byteNum] = (byte) random().nextInt(0x100);
            }

            int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
                    numBytes);
            if (encoded.length < encodedLen)
                encoded = new char[ArrayUtil.oversize(encodedLen, RamUsageEstimator.NUM_BYTES_CHAR)];
            IndexableBinaryStringTools.encode(binary, 0, numBytes, encoded, 0,
                    encodedLen);

            int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
                    encodedLen);
            IndexableBinaryStringTools.decode(encoded, 0, encodedLen, decoded, 0,
                    decodedLen);

            assertEquals("Test #" + (testNum + 1)
                    + ": Round trip encode/decode returned different results:"
                    + System.getProperty("line.separator") + "  original: "
                    + binaryDump(binary, numBytes) + System.getProperty("line.separator")
                    + "encodedBuf: " + charArrayDump(encoded, encodedLen)
                    + System.getProperty("line.separator") + "decodedBuf: "
                    + binaryDump(decoded, decodedLen), binaryDump(binary, numBytes),
                    binaryDump(decoded, decodedLen));
        }
    }

    public String binaryDump(byte[] binary, int numBytes) {
        StringBuilder buf = new StringBuilder();
        for (int byteNum = 0; byteNum < numBytes; ++byteNum) {
            String hex = Integer.toHexString(binary[byteNum] & 0xFF);
            if (hex.length() == 1) {
                buf.append('0');
            }
            buf.append(hex.toUpperCase(Locale.ROOT));
            if (byteNum < numBytes - 1) {
                buf.append(' ');
            }
        }
        return buf.toString();
    }

    public String charArrayDump(char[] charArray, int numBytes) {
        StringBuilder buf = new StringBuilder();
        for (int charNum = 0; charNum < numBytes; ++charNum) {
            String hex = Integer.toHexString(charArray[charNum]);
            for (int digit = 0; digit < 4 - hex.length(); ++digit) {
                buf.append('0');
            }
            buf.append(hex.toUpperCase(Locale.ROOT));
            if (charNum < numBytes - 1) {
                buf.append(' ');
            }
        }
        return buf.toString();
    }
}