add back collation (still the way it was working before)

Robert Muir 2014-11-06 03:11:20 -05:00
parent c2c0345837
commit e45308d9e7
8 changed files with 950 additions and 4 deletions

View File

@@ -101,6 +101,81 @@ The Following example exempts Swedish characters from the folding. Note that the
}
```
ICU Collation
-------------
Uses the collation token filter. It allows you to either specify the rules for collation
(defined [here](http://www.icu-project.org/userguide/Collate_Customization.html)) using the `rules` parameter
(the value can be embedded in the settings or point to a file, whose location may be relative to the config
directory), or to use the `language` parameter (further specialized by country and variant). By default the
filter registers under `icu_collation` or `icuCollation` and uses the default locale.
Here is a sample configuration:
```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "collation" : {
                    "tokenizer" : "keyword",
                    "filter" : ["icu_collation"]
                }
            }
        }
    }
}
```
And here is a sample of custom collation:
```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "collation" : {
                    "tokenizer" : "keyword",
                    "filter" : ["myCollator"]
                }
            },
            "filter" : {
                "myCollator" : {
                    "type" : "icu_collation",
                    "language" : "en"
                }
            }
        }
    }
}
```
Optional settings:
* `strength` - The strength property determines the minimum level of difference considered significant during comparison.
The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`.
See the [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) documentation for a more detailed
explanation of the specific values.
* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property to
`canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were
normalized. If `no` is set, it is the user's responsibility to ensure that all text is already in the appropriate form
before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between
faster and more complete collation behavior. Since a great many of the world's languages do not require text
normalization, most locales set `no` as the default decomposition mode.
Expert options:
* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`
to be either shifted or non-ignorable, which boils down to whether punctuation and whitespace are ignored.
* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether an extra case level is added between
the secondary and tertiary levels. Combined with strength `primary`, this ignores accent differences but keeps case
differences significant.
* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored
for strength `tertiary`.
* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to their numeric representation. For
example, the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`.
* `variableTop` - Single character or contraction. Controls what is variable for `alternate` (see the sample below).
* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Distinguishes between Katakana
and Hiragana characters at `quaternary` strength.
ICU Tokenizer
-------------

View File

@@ -0,0 +1,109 @@
package org.elasticsearch.index.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RawCollationKey;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
/**
* <p>
* Converts each token into its {@link com.ibm.icu.text.CollationKey}, and
* then encodes the CollationKey with {@link IndexableBinaryStringTools}, to
* allow it to be stored as an index term.
* </p>
* <p>
* <strong>WARNING:</strong> Make sure you use exactly the same Collator at
* index and query time -- CollationKeys are only comparable when produced by
* the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are
* independently versioned, so it is safe to search against stored
* CollationKeys if the following are exactly the same (best practice is
* to store this information with the index and check that they remain the
* same at query time):
* </p>
* <ol>
* <li>
* Collator version - see {@link Collator#getVersion()}
* </li>
* <li>
* The collation strength used - see {@link Collator#setStrength(int)}
* </li>
* </ol>
* <p>
* CollationKeys generated by ICU Collators are not compatible with those
* generated by java.text.Collators. Specifically, if you use
* ICUCollationKeyFilter to generate index terms, do not use
* {@code CollationKeyFilter} on the query side, or vice versa.
* </p>
* <p>
* ICUCollationKeyFilter is significantly faster and generates significantly
* shorter keys than CollationKeyFilter. See
* <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
* >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
* @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes
* terms directly as bytes. This filter WAS removed in Lucene 5.0
*/
@Deprecated
public final class ICUCollationKeyFilter extends TokenFilter {
private Collator collator = null;
private RawCollationKey reusableKey = new RawCollationKey();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
*
* @param input Source token stream
* @param collator CollationKey generator
*/
public ICUCollationKeyFilter(TokenStream input, Collator collator) {
super(input);
// clone the collator: see http://userguide.icu-project.org/collation/architecture
try {
this.collator = (Collator) collator.clone();
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e);
}
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
char[] termBuffer = termAtt.buffer();
String termText = new String(termBuffer, 0, termAtt.length());
collator.getRawCollationKey(termText, reusableKey);
int encodedLength = IndexableBinaryStringTools.getEncodedLength(
reusableKey.bytes, 0, reusableKey.size);
if (encodedLength > termBuffer.length) {
termAtt.resizeBuffer(encodedLength);
}
termAtt.setLength(encodedLength);
IndexableBinaryStringTools.encode(reusableKey.bytes, 0, reusableKey.size,
termAtt.buffer(), 0, encodedLength);
return true;
} else {
return false;
}
}
}
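For reference, here is a minimal, self-contained usage sketch of the filter (an illustration, not part of the commit; it assumes the ICU4J and Lucene versions this plugin builds against, and the demo class name, locale, and input are arbitrary):

```java
package org.elasticsearch.index.analysis; // same package as the filter, for illustration

import com.ibm.icu.text.Collator;
import com.ibm.icu.util.ULocale;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.StringReader;

public class CollationKeyFilterDemo {
    public static void main(String[] args) throws Exception {
        // An English collator at primary strength: case and accent differences
        // are not significant, so "Résumé" and "resume" yield the same key.
        Collator collator = Collator.getInstance(ULocale.ENGLISH);
        collator.setStrength(Collator.PRIMARY);

        Tokenizer tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader("Résumé"));
        TokenStream stream = new ICUCollationKeyFilter(tokenizer, collator);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);

        stream.reset();
        while (stream.incrementToken()) {
            // The term buffer now holds the encoded collation key,
            // not the original text.
            System.out.println(term.toString());
        }
        stream.end();
        stream.close();
    }
}
```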

View File

@@ -173,7 +173,6 @@ public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
 @Override
 public TokenStream create(TokenStream tokenStream) {
-    throw new UnsupportedOperationException("i was deprecated in lucene 4, and now i'm gone");
-    // TODO: lucene does sort keys as binary keys since 4.x
+    return new ICUCollationKeyFilter(tokenStream, collator);
 }
 }

View File

@@ -0,0 +1,241 @@
package org.elasticsearch.index.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadoc
/**
* Provides support for converting byte sequences to Strings and back again.
* The resulting Strings preserve the original byte sequences' sort order.
* <p/>
* The Strings are constructed using a Base 8000h encoding of the original
* binary data - each char of an encoded String represents a 15-bit chunk
* from the byte sequence. Base 8000h was chosen because it allows for all
* lower 15 bits of char to be used without restriction; the surrogate range
* [U+D800-U+DFFF] does not represent valid chars, and would require
* complicated handling to avoid them and allow use of char's high bit.
* <p/>
* Although unset bits are used as padding in the final char, the original
* byte sequence could contain trailing bytes with no set bits (null bytes):
* padding is indistinguishable from valid information. To overcome this
* problem, a char is appended, indicating the number of encoded bytes in the
* final content char.
* <p/>
*
* @lucene.experimental
* @deprecated Implement {@link TermToBytesRefAttribute} and store bytes directly
* instead. This class WAS removed in Lucene 5.0
*/
@Deprecated
public final class IndexableBinaryStringTools {
private static final CodingCase[] CODING_CASES = {
// CodingCase(int initialShift, int finalShift)
new CodingCase( 7, 1 ),
// CodingCase(int initialShift, int middleShift, int finalShift)
new CodingCase(14, 6, 2),
new CodingCase(13, 5, 3),
new CodingCase(12, 4, 4),
new CodingCase(11, 3, 5),
new CodingCase(10, 2, 6),
new CodingCase( 9, 1, 7),
new CodingCase( 8, 0 )
};
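// Note: one full cycle through the 8 cases above consumes 15 input bytes
// (advances of 1 + 2*6 + 2 = 15, i.e. 120 bits) and emits 8 chars carrying
// 15 payload bits each.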
// Export only static methods
private IndexableBinaryStringTools() {}
/**
* Returns the number of chars required to encode the given bytes.
*
* @param inputArray byte sequence to be encoded
* @param inputOffset initial offset into inputArray
* @param inputLength number of bytes in inputArray
* @return The number of chars required to encode the number of bytes.
*/
public static int getEncodedLength(byte[] inputArray, int inputOffset,
int inputLength) {
// Use long for intermediaries to protect against overflow
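// Each char carries 15 payload bits, and one trailing count char is always
// appended: e.g. 12 input bytes = 96 bits -> ceil(96/15) = 7 chars, + 1 = 8.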
return (int)((8L * inputLength + 14L) / 15L) + 1;
}
/**
* Returns the number of bytes required to decode the given char sequence.
*
* @param encoded char sequence to be decoded
* @param offset initial offset
* @param length number of characters
* @return The number of bytes required to decode the given char sequence
*/
public static int getDecodedLength(char[] encoded, int offset, int length) {
final int numChars = length - 1;
if (numChars <= 0) {
return 0;
} else {
// Use long for intermediaries to protect against overflow
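// The last char is the count of full bytes encoded in the final content
// char; the remaining chars each carry 15 payload bits. E.g. the 8-char
// encoding of 12 bytes decodes as floor((6*15 + 7)/8) + 0 = 12 bytes.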
final long numFullBytesInFinalChar = encoded[offset + length - 1];
final long numEncodedChars = numChars - 1;
return (int)((numEncodedChars * 15L + 7L) / 8L + numFullBytesInFinalChar);
}
}
/**
* Encodes the input byte sequence into the output char sequence. Before
* calling this method, ensure that the output array has sufficient
* capacity by calling {@link #getEncodedLength(byte[], int, int)}.
*
* @param inputArray byte sequence to be encoded
* @param inputOffset initial offset into inputArray
* @param inputLength number of bytes in inputArray
* @param outputArray char sequence to store encoded result
* @param outputOffset initial offset into outputArray
* @param outputLength length of output, must be getEncodedLength(inputArray, inputOffset, inputLength)
*/
public static void encode(byte[] inputArray, int inputOffset,
int inputLength, char[] outputArray, int outputOffset, int outputLength) {
assert (outputLength == getEncodedLength(inputArray, inputOffset,
inputLength));
if (inputLength > 0) {
int inputByteNum = inputOffset;
int caseNum = 0;
int outputCharNum = outputOffset;
CodingCase codingCase;
for (; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength; ++outputCharNum) {
codingCase = CODING_CASES[caseNum];
if (2 == codingCase.numBytes) {
outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
+ (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF);
} else { // numBytes is 3
outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
+ ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)
+ (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF);
}
inputByteNum += codingCase.advanceBytes;
if (++caseNum == CODING_CASES.length) {
caseNum = 0;
}
}
// Produce final char (if any) and trailing count chars.
codingCase = CODING_CASES[caseNum];
if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3
outputArray[outputCharNum++] = (char) ((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) & (short) 0x7FFF);
// Add trailing char containing the number of full bytes in final char
outputArray[outputCharNum++] = (char) 1;
} else if (inputByteNum < inputLength) {
outputArray[outputCharNum++] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) & (short) 0x7FFF);
// Add trailing char containing the number of full bytes in final char
outputArray[outputCharNum++] = caseNum == 0 ? (char) 1 : (char) 0;
} else { // No left over bits - last char is completely filled.
// Add trailing char containing the number of full bytes in final char
outputArray[outputCharNum++] = (char) 1;
}
}
}
/**
* Decodes the input char sequence into the output byte sequence. Before
* calling this method, ensure that the output array has sufficient capacity
* by calling {@link #getDecodedLength(char[], int, int)}.
*
* @param inputArray char sequence to be decoded
* @param inputOffset initial offset into inputArray
* @param inputLength number of chars in inputArray
* @param outputArray byte sequence to store decoded result
* @param outputOffset initial offset into outputArray
* @param outputLength length of output, must be
* getDecodedLength(inputArray, inputOffset, inputLength)
*/
public static void decode(char[] inputArray, int inputOffset,
int inputLength, byte[] outputArray, int outputOffset, int outputLength) {
assert (outputLength == getDecodedLength(inputArray, inputOffset,
inputLength));
final int numInputChars = inputLength - 1;
final int numOutputBytes = outputLength;
if (numOutputBytes > 0) {
int caseNum = 0;
int outputByteNum = outputOffset;
int inputCharNum = inputOffset;
short inputChar;
CodingCase codingCase;
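// Reverse the packing: each 15-bit char contributes to 2 or 3 output bytes,
// and adjacent chars can share an output byte, hence the += accumulation at
// cycle boundaries below.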
for (; inputCharNum < numInputChars - 1; ++inputCharNum) {
codingCase = CODING_CASES[caseNum];
inputChar = (short) inputArray[inputCharNum];
if (2 == codingCase.numBytes) {
if (0 == caseNum) {
outputArray[outputByteNum] = (byte) (inputChar >>> codingCase.initialShift);
} else {
outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
}
outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
} else { // numBytes is 3
outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
}
outputByteNum += codingCase.advanceBytes;
if (++caseNum == CODING_CASES.length) {
caseNum = 0;
}
}
// Handle final char
inputChar = (short) inputArray[inputCharNum];
codingCase = CODING_CASES[caseNum];
if (0 == caseNum) {
outputArray[outputByteNum] = 0;
}
outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
final int bytesLeft = numOutputBytes - outputByteNum;
if (bytesLeft > 1) {
if (2 == codingCase.numBytes) {
outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) >>> codingCase.finalShift);
} else { // numBytes is 3
outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
if (bytesLeft > 2) {
outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
}
}
}
}
}
static class CodingCase {
int numBytes, initialShift, middleShift, finalShift, advanceBytes = 2;
short middleMask, finalMask;
CodingCase(int initialShift, int middleShift, int finalShift) {
this.numBytes = 3;
this.initialShift = initialShift;
this.middleShift = middleShift;
this.finalShift = finalShift;
this.finalMask = (short)((short)0xFF >>> finalShift);
this.middleMask = (short)((short)0xFF << middleShift);
}
CodingCase(int initialShift, int finalShift) {
this.numBytes = 2;
this.initialShift = initialShift;
this.finalShift = finalShift;
this.finalMask = (short)((short)0xFF >>> finalShift);
if (finalShift != 0) {
advanceBytes = 1;
}
}
}
}

View File

@@ -19,6 +19,7 @@
package org.elasticsearch.indices.analysis;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.Transliterator;
import org.apache.lucene.analysis.TokenStream;
@@ -29,6 +30,7 @@ import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.analysis.ICUCollationKeyFilter;
import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory;
import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
@@ -81,6 +83,18 @@ public class IcuIndicesAnalysis extends AbstractComponent {
}
}));
indicesAnalysisService.tokenFilterFactories().put("icu_collation", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
@Override
public String name() {
return "icu_collation";
}
@Override
public TokenStream create(TokenStream tokenStream) {
return new ICUCollationKeyFilter(tokenStream, Collator.getInstance());
}
}));
indicesAnalysisService.tokenFilterFactories().put("icu_transform", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
@Override
public String name() {

View File

@@ -52,8 +52,10 @@ public class ICUIntegrationTests extends ElasticsearchIntegrationTest {
 Settings settings = ImmutableSettings.builder()
     .put(super.indexSettings())
     .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
-    .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "my_folding")
-    .put("index.analysis.filter.my_folding.type", "icu_folding")
+    .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_collator")
+    .put("index.analysis.filter.my_collator.type", "icu_collation")
+    .put("index.analysis.filter.my_collator.language", "en")
+    .put("index.analysis.filter.my_collator.strength", "primary")
     .build();
 return settings;

View File

@@ -0,0 +1,255 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
import java.io.IOException;
import java.io.StringReader;
import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService;
import static org.hamcrest.Matchers.equalTo;
// Tests borrowed from Solr's Icu collation key filter factory test.
public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase {
/*
* Turkish has some funny casing.
* This test shows how you can solve this kind of thing easily with collation.
* Instead of using LowerCaseFilter, use a Turkish collator with primary strength.
* Then things will sort and match correctly.
*/
@Test
public void testBasicUsage() throws Exception {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "tr")
.put("index.analysis.filter.myCollator.strength", "primary")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "I WİLL USE TURKİSH CASING", "ı will use turkish casıng");
}
/*
* Test usage of the decomposition option for unicode normalization.
*/
@Test
public void testNormalization() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "tr")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.decomposition", "canonical")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "I W\u0049\u0307LL USE TURKİSH CASING", "ı will use turkish casıng");
}
/*
* Test secondary strength; for English, case is not significant.
*/
@Test
public void testSecondaryStrength() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "secondary")
.put("index.analysis.filter.myCollator.decomposition", "no")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "TESTING", "testing");
}
/*
* Setting alternate=shifted to shift whitespace, punctuation and symbols
* to quaternary level
*/
@Test
public void testIgnorePunctuation() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.alternate", "shifted")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "foo-bar", "foo bar");
}
/*
* Setting alternate=shifted and variableTop to shift whitespace, but not
* punctuation or symbols, to quaternary level
*/
@Test
public void testIgnoreWhitespace() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.alternate", "shifted")
.put("index.analysis.filter.myCollator.variableTop", " ")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "foo bar", "foobar");
// now assert that punctuation still matters: foo-bar < foo bar
assertCollation(filterFactory, "foo-bar", "foo bar", -1);
}
/*
* Setting numeric to encode digits with numeric value, so that
* foobar-9 sorts before foobar-10
*/
@Test
public void testNumerics() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.numeric", "true")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollation(filterFactory, "foobar-9", "foobar-10", -1);
}
/*
* Setting caseLevel=true to create an additional case level between
* secondary and tertiary
*/
@Test
public void testIgnoreAccentsButNotCase() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.caseLevel", "true")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "résumé", "resume");
assertCollatesToSame(filterFactory, "Résumé", "Resume");
// now assert that case still matters: resume < Resume
assertCollation(filterFactory, "resume", "Resume", -1);
}
/*
* Setting caseFirst=upper to cause uppercase strings to sort
* before lowercase ones.
*/
@Test
public void testUpperCaseFirst() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "tertiary")
.put("index.analysis.filter.myCollator.caseFirst", "upper")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollation(filterFactory, "Resume", "resume", -1);
}
/*
* For German, you might want oe to sort and match with o umlaut.
* This is not the default, but you can make a customized ruleset to do this.
*
* The default is DIN 5007-1; this shows how to tailor a collator to get DIN 5007-2 behavior.
* http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
*/
@Test
public void testCustomRules() throws Exception {
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
String DIN5007_2_tailorings =
"& ae , a\u0308 & AE , A\u0308"+
"& oe , o\u0308 & OE , O\u0308"+
"& ue , u\u0308 & UE , u\u0308";
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
String tailoredRules = tailoredCollator.getRules();
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.rules", tailoredRules)
.put("index.analysis.filter.myCollator.strength", "primary")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "Töne", "Toene");
}
private void assertCollatesToSame(TokenFilterFactory factory, String string1, String string2) throws IOException {
assertCollation(factory, string1, string2, 0);
}
private void assertCollation(TokenFilterFactory factory, String string1, String string2, int comparison) throws IOException {
Tokenizer tokenizer = new KeywordTokenizer();
tokenizer.setReader(new StringReader(string1));
TokenStream stream1 = factory.create(tokenizer);
tokenizer = new KeywordTokenizer();
tokenizer.setReader(new StringReader(string2));
TokenStream stream2 = factory.create(tokenizer);
assertCollation(stream1, stream2, comparison);
}
private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
stream1.reset();
stream2.reset();
assertThat(stream1.incrementToken(), equalTo(true));
assertThat(stream2.incrementToken(), equalTo(true));
assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison)));
assertThat(stream1.incrementToken(), equalTo(false));
assertThat(stream2.incrementToken(), equalTo(false));
stream1.end();
stream2.end();
stream1.close();
stream2.close();
}
}

View File

@@ -0,0 +1,251 @@
package org.elasticsearch.index.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.TimeUnits;
import org.elasticsearch.test.ElasticsearchThreadFilter;
import org.elasticsearch.test.junit.listeners.ReproduceInfoPrinter;
import org.junit.BeforeClass;
import com.carrotsearch.randomizedtesting.annotations.Listeners;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope;
import java.util.Locale;
/**
* @deprecated Remove when IndexableBinaryStringTools is removed.
*/
@Deprecated
@Listeners({
ReproduceInfoPrinter.class
})
@ThreadLeakFilters(defaultFilters = true, filters = {ElasticsearchThreadFilter.class})
@ThreadLeakScope(Scope.NONE)
@TimeoutSuite(millis = TimeUnits.HOUR)
@LuceneTestCase.SuppressSysoutChecks(bugUrl = "we log a lot on purpose")
public class TestIndexableBinaryStringTools extends LuceneTestCase {
private static int NUM_RANDOM_TESTS;
private static int MAX_RANDOM_BINARY_LENGTH;
@BeforeClass
public static void beforeClass() throws Exception {
NUM_RANDOM_TESTS = atLeast(200);
MAX_RANDOM_BINARY_LENGTH = atLeast(300);
}
public void testSingleBinaryRoundTrip() {
byte[] binary = new byte[] { (byte) 0x23, (byte) 0x98, (byte) 0x13,
(byte) 0xE4, (byte) 0x76, (byte) 0x41, (byte) 0xB2, (byte) 0xC9,
(byte) 0x7F, (byte) 0x0A, (byte) 0xA6, (byte) 0xD8 };
int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
binary.length);
char encoded[] = new char[encodedLen];
IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0,
encoded.length);
int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
encoded.length);
byte decoded[] = new byte[decodedLen];
IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0,
decoded.length);
assertEquals("Round trip decode/decode returned different results:"
+ System.getProperty("line.separator") + "original: "
+ binaryDump(binary, binary.length)
+ System.getProperty("line.separator") + " encoded: "
+ charArrayDump(encoded, encoded.length)
+ System.getProperty("line.separator") + " decoded: "
+ binaryDump(decoded, decoded.length),
binaryDump(binary, binary.length), binaryDump(decoded, decoded.length));
}
public void testEncodedSortability() {
byte[] originalArray1 = new byte[MAX_RANDOM_BINARY_LENGTH];
char[] originalString1 = new char[MAX_RANDOM_BINARY_LENGTH];
char[] encoded1 = new char[MAX_RANDOM_BINARY_LENGTH * 10];
byte[] original2 = new byte[MAX_RANDOM_BINARY_LENGTH];
char[] originalString2 = new char[MAX_RANDOM_BINARY_LENGTH];
char[] encoded2 = new char[MAX_RANDOM_BINARY_LENGTH * 10];
for (int testNum = 0; testNum < NUM_RANDOM_TESTS; ++testNum) {
int numBytes1 = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1
for (int byteNum = 0; byteNum < numBytes1; ++byteNum) {
int randomInt = random().nextInt(0x100);
originalArray1[byteNum] = (byte) randomInt;
originalString1[byteNum] = (char) randomInt;
}
int numBytes2 = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1
for (int byteNum = 0; byteNum < numBytes2; ++byteNum) {
int randomInt = random().nextInt(0x100);
original2[byteNum] = (byte) randomInt;
originalString2[byteNum] = (char) randomInt;
}
int originalComparison = new String(originalString1, 0, numBytes1)
.compareTo(new String(originalString2, 0, numBytes2));
originalComparison = originalComparison < 0 ? -1
: originalComparison > 0 ? 1 : 0;
int encodedLen1 = IndexableBinaryStringTools.getEncodedLength(
originalArray1, 0, numBytes1);
if (encodedLen1 > encoded1.length)
encoded1 = new char[ArrayUtil.oversize(encodedLen1, RamUsageEstimator.NUM_BYTES_CHAR)];
IndexableBinaryStringTools.encode(originalArray1, 0, numBytes1, encoded1,
0, encodedLen1);
int encodedLen2 = IndexableBinaryStringTools.getEncodedLength(original2,
0, numBytes2);
if (encodedLen2 > encoded2.length)
encoded2 = new char[ArrayUtil.oversize(encodedLen2, RamUsageEstimator.NUM_BYTES_CHAR)];
IndexableBinaryStringTools.encode(original2, 0, numBytes2, encoded2, 0,
encodedLen2);
int encodedComparison = new String(encoded1, 0, encodedLen1)
.compareTo(new String(encoded2, 0, encodedLen2));
encodedComparison = encodedComparison < 0 ? -1
: encodedComparison > 0 ? 1 : 0;
assertEquals("Test #" + (testNum + 1)
+ ": Original bytes and encoded chars compare differently:"
+ System.getProperty("line.separator") + " binary 1: "
+ binaryDump(originalArray1, numBytes1)
+ System.getProperty("line.separator") + " binary 2: "
+ binaryDump(original2, numBytes2)
+ System.getProperty("line.separator") + "encoded 1: "
+ charArrayDump(encoded1, encodedLen1)
+ System.getProperty("line.separator") + "encoded 2: "
+ charArrayDump(encoded2, encodedLen2)
+ System.getProperty("line.separator"), originalComparison,
encodedComparison);
}
}
public void testEmptyInput() {
byte[] binary = new byte[0];
int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
binary.length);
char[] encoded = new char[encodedLen];
IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0,
encoded.length);
int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
encoded.length);
byte[] decoded = new byte[decodedLen];
IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0,
decoded.length);
assertEquals("decoded empty input was not empty", decoded.length, 0);
}
public void testAllNullInput() {
byte[] binary = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
binary.length);
char encoded[] = new char[encodedLen];
IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0,
encoded.length);
int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
encoded.length);
byte[] decoded = new byte[decodedLen];
IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0,
decoded.length);
assertEquals("Round trip decode/decode returned different results:"
+ System.getProperty("line.separator") + " original: "
+ binaryDump(binary, binary.length)
+ System.getProperty("line.separator") + "decodedBuf: "
+ binaryDump(decoded, decoded.length),
binaryDump(binary, binary.length), binaryDump(decoded, decoded.length));
}
public void testRandomBinaryRoundTrip() {
byte[] binary = new byte[MAX_RANDOM_BINARY_LENGTH];
char[] encoded = new char[MAX_RANDOM_BINARY_LENGTH * 10];
byte[] decoded = new byte[MAX_RANDOM_BINARY_LENGTH];
for (int testNum = 0; testNum < NUM_RANDOM_TESTS; ++testNum) {
int numBytes = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1
for (int byteNum = 0; byteNum < numBytes; ++byteNum) {
binary[byteNum] = (byte) random().nextInt(0x100);
}
int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
numBytes);
if (encoded.length < encodedLen)
encoded = new char[ArrayUtil.oversize(encodedLen, RamUsageEstimator.NUM_BYTES_CHAR)];
IndexableBinaryStringTools.encode(binary, 0, numBytes, encoded, 0,
encodedLen);
int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
encodedLen);
IndexableBinaryStringTools.decode(encoded, 0, encodedLen, decoded, 0,
decodedLen);
assertEquals("Test #" + (testNum + 1)
+ ": Round trip decode/decode returned different results:"
+ System.getProperty("line.separator") + " original: "
+ binaryDump(binary, numBytes) + System.getProperty("line.separator")
+ "encodedBuf: " + charArrayDump(encoded, encodedLen)
+ System.getProperty("line.separator") + "decodedBuf: "
+ binaryDump(decoded, decodedLen), binaryDump(binary, numBytes),
binaryDump(decoded, decodedLen));
}
}
public String binaryDump(byte[] binary, int numBytes) {
StringBuilder buf = new StringBuilder();
for (int byteNum = 0 ; byteNum < numBytes ; ++byteNum) {
String hex = Integer.toHexString(binary[byteNum] & 0xFF);
if (hex.length() == 1) {
buf.append('0');
}
buf.append(hex.toUpperCase(Locale.ROOT));
if (byteNum < numBytes - 1) {
buf.append(' ');
}
}
return buf.toString();
}
public String charArrayDump(char[] charArray, int numChars) {
StringBuilder buf = new StringBuilder();
for (int charNum = 0 ; charNum < numChars ; ++charNum) {
String hex = Integer.toHexString(charArray[charNum]);
for (int digit = 0 ; digit < 4 - hex.length() ; ++digit) {
buf.append('0');
}
buf.append(hex.toUpperCase(Locale.ROOT));
if (charNum < numChars - 1) {
buf.append(' ');
}
}
return buf.toString();
}
}