migrate branch for analysis-icu

This commit is contained in:
Simon Willnauer 2015-06-05 13:12:26 +02:00
commit f3228e394d
22 changed files with 2264 additions and 0 deletions

View File

@ -0,0 +1,290 @@
ICU Analysis for Elasticsearch
==================================
The ICU Analysis plugin integrates the Lucene ICU module into Elasticsearch, adding ICU-related analysis components.
To install the plugin, run:
```sh
bin/plugin install elasticsearch/elasticsearch-analysis-icu/2.5.0
```
You need to install a version matching your Elasticsearch version:
| elasticsearch | ICU Analysis Plugin | Docs |
|---------------|-----------------------|------------|
| master | Build from source | See below |
| es-1.x | Build from source | [2.6.0-SNAPSHOT](https://github.com/elastic/elasticsearch-analysis-icu/tree/es-1.x/#version-260-snapshot-for-elasticsearch-1x) |
| es-1.5 | 2.5.0 | [2.5.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.5.0/#version-250-for-elasticsearch-15) |
| es-1.4 | 2.4.3 | [2.4.3](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.3/#version-243-for-elasticsearch-14) |
| < 1.4.5 | 2.4.2 | [2.4.2](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.4.2/#version-242-for-elasticsearch-14) |
| < 1.4.3 | 2.4.1 | [2.4.1](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.4.1/#version-241-for-elasticsearch-14) |
| es-1.3 | 2.3.0 | [2.3.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.3.0/#icu-analysis-for-elasticsearch) |
| es-1.2 | 2.2.0 | [2.2.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.2.0/#icu-analysis-for-elasticsearch) |
| es-1.1 | 2.1.0 | [2.1.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.1.0/#icu-analysis-for-elasticsearch) |
| es-1.0 | 2.0.0 | [2.0.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.0.0/#icu-analysis-for-elasticsearch) |
| es-0.90 | 1.13.0 | [1.13.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v1.13.0/#icu-analysis-for-elasticsearch) |
To build a `SNAPSHOT` version, build the plugin with Maven:
```bash
mvn clean install
plugin --install analysis-icu \
       --url file:target/releases/elasticsearch-analysis-icu-X.X.X-SNAPSHOT.zip
```
ICU Normalization
-----------------
Normalizes characters as explained [here](http://userguide.icu-project.org/transforms/normalization). It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings. Allows for the `name` parameter to be provided, which can be one of the following values: `nfc`, `nfkc`, or `nfkc_cf`. Here are sample settings:
```js
{
  "index" : {
    "analysis" : {
      "analyzer" : {
        "normalized" : {
          "tokenizer" : "keyword",
          "filter" : ["icu_normalizer"]
        }
      }
    }
  }
}
```
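The default `name` is `nfkc_cf`. As a hedged sketch (the filter name `nfc_normalizer` is illustrative, not part of the plugin), the `name` parameter can be set on a custom filter definition:
```js
{
  "index" : {
    "analysis" : {
      "analyzer" : {
        "nfc_normalized" : {
          "tokenizer" : "keyword",
          "filter" : ["nfc_normalizer"]
        }
      },
      "filter" : {
        "nfc_normalizer" : {
          "type" : "icu_normalizer",
          "name" : "nfc"
        }
      }
    }
  }
}
```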
ICU Folding
-----------
Folding of Unicode characters, based on `UTR#30`. It registers itself under the `icu_folding` and `icuFolding` names. Sample settings:
```js
{
  "index" : {
    "analysis" : {
      "analyzer" : {
        "folded" : {
          "tokenizer" : "keyword",
          "filter" : ["icu_folding"]
        }
      }
    }
  }
}
```
ICU Filtering
-------------
The folding can be filtered by a set of Unicode characters with the parameter `unicodeSetFilter`. This is useful for a
non-internationalized search engine where you want to retain a set of national characters that are primary letters in a specific
language. See the syntax for the UnicodeSet [here](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html).
The following example exempts Swedish characters from the folding. Note that the filtered characters are NOT lowercased, which is why we add the `lowercase` filter below.
```js
{
  "index" : {
    "analysis" : {
      "analyzer" : {
        "folding" : {
          "tokenizer" : "standard",
          "filter" : ["my_icu_folding", "lowercase"]
        }
      },
      "filter" : {
        "my_icu_folding" : {
          "type" : "icu_folding",
          "unicodeSetFilter" : "[^åäöÅÄÖ]"
        }
      }
    }
  }
}
```
ICU Collation
-------------
Uses the collation token filter. Allows you to either specify the rules for collation
(defined [here](http://www.icu-project.org/userguide/Collate_Customization.html)) using the `rules` parameter
(which can point to a file location, relative to the config location, or embed the rules directly in the settings), or to use the
`language` parameter (further specialized by `country` and `variant`). By default it registers under `icu_collation` or
`icuCollation` and uses the default locale.
Here are sample settings:
```js
{
  "index" : {
    "analysis" : {
      "analyzer" : {
        "collation" : {
          "tokenizer" : "keyword",
          "filter" : ["icu_collation"]
        }
      }
    }
  }
}
```
And here is a sample of custom collation:
```js
{
  "index" : {
    "analysis" : {
      "analyzer" : {
        "collation" : {
          "tokenizer" : "keyword",
          "filter" : ["myCollator"]
        }
      },
      "filter" : {
        "myCollator" : {
          "type" : "icu_collation",
          "language" : "en"
        }
      }
    }
  }
}
```
Optional settings:
* `strength` - The strength property determines the minimum level of difference considered significant during comparison.
The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`.
See the [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) documentation for a more detailed
explanation of the specific values.
* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this property to
`canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were
normalized. If `no` is set, it is the user's responsibility to ensure that all text is already in the appropriate form
before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between
faster and more complete collation behavior. Since a great many of the world's languages do not require text
normalization, most locales set `no` as the default decomposition mode.
Expert options (a combined sample follows this list):
* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`
to be either shifted or non-ignorable, which boils down to ignoring punctuation and whitespace.
* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether case level sorting is required. When
strength is set to `primary` this will ignore accent differences.
* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored
for strength `tertiary`.
* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to numeric representation. For
example the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`.
* `variableTop` - Single character or contraction. Controls what is variable for `alternate`.
* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Distinguishes between Katakana
and Hiragana characters in `quaternary` strength.
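As a combined sketch of these options (the filter name `my_collator` and the particular values are illustrative), a collator that sorts digits numerically and, at `quaternary` strength, ignores punctuation and whitespace might be configured as:
```js
{
  "index" : {
    "analysis" : {
      "filter" : {
        "my_collator" : {
          "type" : "icu_collation",
          "language" : "en",
          "strength" : "quaternary",
          "alternate" : "shifted",
          "numeric" : "true",
          "caseFirst" : "lower"
        }
      }
    }
  }
}
```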
ICU Tokenizer
-------------
Breaks text into words according to [UAX #29: Unicode Text Segmentation](http://www.unicode.org/reports/tr29/).
```js
{
  "index" : {
    "analysis" : {
      "analyzer" : {
        "tokenized" : {
          "tokenizer" : "icu_tokenizer"
        }
      }
    }
  }
}
```
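The tokenizer composes with the other components above. As a sketch (the analyzer name `folded_unicode` is illustrative), pairing `icu_tokenizer` with `icu_folding`:
```js
{
  "index" : {
    "analysis" : {
      "analyzer" : {
        "folded_unicode" : {
          "tokenizer" : "icu_tokenizer",
          "filter" : ["icu_folding"]
        }
      }
    }
  }
}
```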
ICU Normalization CharFilter
----------------------------
Normalizes characters as explained [here](http://userguide.icu-project.org/transforms/normalization).
It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings.
Allows for the `name` parameter to be provided, which can be one of the following values: `nfc`, `nfkc`, or `nfkc_cf`.
Allows for the `mode` parameter to be provided, which can be either `compose` or `decompose`.
Use `decompose` with `nfc` or `nfkc` to get `nfd` or `nfkd`, respectively.
Here are sample settings:
```js
{
  "index" : {
    "analysis" : {
      "analyzer" : {
        "normalized" : {
          "tokenizer" : "keyword",
          "char_filter" : ["icu_normalizer"]
        }
      }
    }
  }
}
```
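The defaults are `nfkc_cf` and `compose`. A hedged sketch of an NFD char filter, obtained by combining `name: nfc` with `mode: decompose` (the char filter name `nfd_normalizer` is illustrative):
```js
{
  "index" : {
    "analysis" : {
      "analyzer" : {
        "nfd_normalized" : {
          "tokenizer" : "keyword",
          "char_filter" : ["nfd_normalizer"]
        }
      },
      "char_filter" : {
        "nfd_normalizer" : {
          "type" : "icu_normalizer",
          "name" : "nfc",
          "mode" : "decompose"
        }
      }
    }
  }
}
```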
ICU Transform
-------------
Transforms are used to process Unicode text in many different ways, including case mapping, normalization,
transliteration, and bidirectional text handling.
You can define the transliterator identifier with the `id` property and set the direction to `forward` or `reverse` with
the `dir` property. The default values of the two properties are `Null` and `forward`, respectively.
For example:
```js
{
  "index" : {
    "analysis" : {
      "analyzer" : {
        "latin" : {
          "tokenizer" : "keyword",
          "filter" : ["myLatinTransform"]
        }
      },
      "filter" : {
        "myLatinTransform" : {
          "type" : "icu_transform",
          "id" : "Any-Latin; NFD; [:Nonspacing Mark:] Remove; NFC"
        }
      }
    }
  }
}
```
This transform transliterates characters to Latin, separates accents from their base characters, removes the accents,
and then puts the remaining text into an unaccented form.
The results are:
* `你好` to `ni hao`
* `здравствуйте` to `zdravstvujte`
* `こんにちは` to `kon'nichiha`
Currently the filter only supports the transliterator identifier and direction; custom rulesets are not yet supported.
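As a sketch of the `dir` property (the filter name is illustrative; `Cyrillic-Latin` is a standard ICU transliterator identifier, used here in reverse to go from Latin to Cyrillic):
```js
{
  "index" : {
    "analysis" : {
      "filter" : {
        "myCyrillicTransform" : {
          "type" : "icu_transform",
          "id" : "Cyrillic-Latin",
          "dir" : "reverse"
        }
      }
    }
  }
}
```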
For more documentation, please see the [user guide of ICU Transform](http://userguide.icu-project.org/transforms/general).
License
-------
This software is licensed under the Apache 2 license, quoted below.
Copyright 2009-2014 Elasticsearch <http://www.elasticsearch.org>
Licensed under the Apache License, Version 2.0 (the "License"); you may not
use this file except in compliance with the License. You may obtain a copy of
the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and limitations under
the License.

View File

@ -0,0 +1,57 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.elasticsearch.plugin</groupId>
    <artifactId>elasticsearch-analysis-icu</artifactId>
    <packaging>jar</packaging>
    <name>Elasticsearch ICU Analysis plugin</name>
    <description>The ICU Analysis plugin integrates the Lucene ICU module into elasticsearch, adding ICU-related analysis components.</description>
    <parent>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch-plugin</artifactId>
        <version>2.0.0-SNAPSHOT</version>
    </parent>
    <properties>
        <tests.jvms>1</tests.jvms>
        <es.logger.level>INFO</es.logger.level>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-icu</artifactId>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
            </plugin>
        </plugins>
        <pluginManagement>
            <plugins>
                <plugin>
                    <groupId>com.mycila</groupId>
                    <artifactId>license-maven-plugin</artifactId>
                    <configuration>
                        <excludes>
                            <!-- TODO: https://github.com/elastic/elasticsearch-analysis-icu/issues/29 -->
                            <exclude>**/IndexableBinaryStringTools.java</exclude>
                            <exclude>**/ICUCollationKeyFilter.java</exclude>
                            <exclude>**/TestIndexableBinaryStringTools.java</exclude>
                        </excludes>
                    </configuration>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>
</project>

View File

@ -0,0 +1,26 @@
<?xml version="1.0"?>
<assembly>
    <id>plugin</id>
    <formats>
        <format>zip</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <dependencySets>
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <excludes>
                <exclude>org.elasticsearch:elasticsearch</exclude>
            </excludes>
        </dependencySet>
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <includes>
                <include>org.apache.lucene:lucene-analyzers-icu</include>
            </includes>
        </dependencySet>
    </dependencySets>
</assembly>

View File

@ -0,0 +1,109 @@
package org.elasticsearch.index.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RawCollationKey;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
/**
* <p>
* Converts each token into its {@link com.ibm.icu.text.CollationKey}, and
* then encodes the CollationKey with {@link IndexableBinaryStringTools}, to
* allow it to be stored as an index term.
* </p>
* <p>
* <strong>WARNING:</strong> Make sure you use exactly the same Collator at
* index and query time -- CollationKeys are only comparable when produced by
* the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are
* independently versioned, so it is safe to search against stored
* CollationKeys if the following are exactly the same (best practice is
* to store this information with the index and check that they remain the
* same at query time):
* </p>
* <ol>
* <li>
* Collator version - see {@link Collator#getVersion()}
* </li>
* <li>
* The collation strength used - see {@link Collator#setStrength(int)}
* </li>
* </ol>
* <p>
* CollationKeys generated by ICU Collators are not compatible with those
* generated by java.text.Collators. Specifically, if you use
* ICUCollationKeyFilter to generate index terms, do not use
* {@code CollationKeyFilter} on the query side, or vice versa.
* </p>
* <p>
* ICUCollationKeyFilter is significantly faster and generates significantly
* shorter keys than CollationKeyFilter. See
* <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
* >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
* @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes
* terms directly as bytes. This filter WAS removed in Lucene 5.0
*/
@Deprecated
public final class ICUCollationKeyFilter extends TokenFilter {
  private Collator collator = null;
  private RawCollationKey reusableKey = new RawCollationKey();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * @param input Source token stream
   * @param collator CollationKey generator
   */
  public ICUCollationKeyFilter(TokenStream input, Collator collator) {
    super(input);
    // clone the collator: see http://userguide.icu-project.org/collation/architecture
    try {
      this.collator = (Collator) collator.clone();
    } catch (CloneNotSupportedException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      char[] termBuffer = termAtt.buffer();
      String termText = new String(termBuffer, 0, termAtt.length());
      collator.getRawCollationKey(termText, reusableKey);
      int encodedLength = IndexableBinaryStringTools.getEncodedLength(
          reusableKey.bytes, 0, reusableKey.size);
      if (encodedLength > termBuffer.length) {
        termAtt.resizeBuffer(encodedLength);
      }
      termAtt.setLength(encodedLength);
      IndexableBinaryStringTools.encode(reusableKey.bytes, 0, reusableKey.size,
          termAtt.buffer(), 0, encodedLength);
      return true;
    } else {
      return false;
    }
  }
}

View File

@ -0,0 +1,43 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
/**
*/
public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {

    @Override
    public void processCharFilters(CharFiltersBindings charFiltersBindings) {
        charFiltersBindings.processCharFilter("icu_normalizer", IcuNormalizerCharFilterFactory.class);
    }

    @Override
    public void processTokenizers(TokenizersBindings tokenizersBindings) {
        tokenizersBindings.processTokenizer("icu_tokenizer", IcuTokenizerFactory.class);
    }

    @Override
    public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
        tokenFiltersBindings.processTokenFilter("icu_normalizer", IcuNormalizerTokenFilterFactory.class);
        tokenFiltersBindings.processTokenFilter("icu_folding", IcuFoldingTokenFilterFactory.class);
        tokenFiltersBindings.processTokenFilter("icu_collation", IcuCollationTokenFilterFactory.class);
        tokenFiltersBindings.processTokenFilter("icu_transform", IcuTransformTokenFilterFactory.class);
    }
}

View File

@ -0,0 +1,178 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.io.Streams;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.FailedToResolveConfigException;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
/**
* An ICU based collation token filter. There are two ways to configure collation:
* <p/>
* <p>The first is simply specifying the locale (defaults to the default locale). The <tt>language</tt>
* parameter is the lowercase two-letter ISO-639 code. An additional <tt>country</tt> and <tt>variant</tt>
* can be provided.
* <p/>
* <p>The second option is to specify collation rules as defined in the <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
* Collation customization</a> chapter in the ICU docs. The <tt>rules</tt> parameter can either embed the rules definition
* in the settings or refer to an external location (preferably located under the <tt>config</tt> location, relative to it).
*/
public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {

    private final Collator collator;

    @Inject
    public IcuCollationTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment environment, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);

        Collator collator;
        String rules = settings.get("rules");
        if (rules != null) {
            Exception failureToResolve = null;
            try {
                rules = Streams.copyToString(Files.newBufferedReader(environment.configFile().resolve(rules), Charset.forName("UTF-8")));
            } catch (FailedToResolveConfigException | IOException | SecurityException e) {
                failureToResolve = e;
            }
            try {
                collator = new RuleBasedCollator(rules);
            } catch (Exception e) {
                if (failureToResolve != null) {
                    throw new IllegalArgumentException("Failed to resolve collation rules location", failureToResolve);
                } else {
                    throw new IllegalArgumentException("Failed to parse collation rules", e);
                }
            }
        } else {
            String language = settings.get("language");
            if (language != null) {
                ULocale locale;
                String country = settings.get("country");
                if (country != null) {
                    String variant = settings.get("variant");
                    if (variant != null) {
                        locale = new ULocale(language, country, variant);
                    } else {
                        locale = new ULocale(language, country);
                    }
                } else {
                    locale = new ULocale(language);
                }
                collator = Collator.getInstance(locale);
            } else {
                collator = Collator.getInstance();
            }
        }

        // set the strength flag, otherwise it will be the default.
        String strength = settings.get("strength");
        if (strength != null) {
            if (strength.equalsIgnoreCase("primary")) {
                collator.setStrength(Collator.PRIMARY);
            } else if (strength.equalsIgnoreCase("secondary")) {
                collator.setStrength(Collator.SECONDARY);
            } else if (strength.equalsIgnoreCase("tertiary")) {
                collator.setStrength(Collator.TERTIARY);
            } else if (strength.equalsIgnoreCase("quaternary")) {
                collator.setStrength(Collator.QUATERNARY);
            } else if (strength.equalsIgnoreCase("identical")) {
                collator.setStrength(Collator.IDENTICAL);
            } else {
                throw new IllegalArgumentException("Invalid strength: " + strength);
            }
        }

        // set the decomposition flag, otherwise it will be the default.
        String decomposition = settings.get("decomposition");
        if (decomposition != null) {
            if (decomposition.equalsIgnoreCase("no")) {
                collator.setDecomposition(Collator.NO_DECOMPOSITION);
            } else if (decomposition.equalsIgnoreCase("canonical")) {
                collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
            } else {
                throw new IllegalArgumentException("Invalid decomposition: " + decomposition);
            }
        }

        // expert options: concrete subclasses are always a RuleBasedCollator
        RuleBasedCollator rbc = (RuleBasedCollator) collator;
        String alternate = settings.get("alternate");
        if (alternate != null) {
            if (alternate.equalsIgnoreCase("shifted")) {
                rbc.setAlternateHandlingShifted(true);
            } else if (alternate.equalsIgnoreCase("non-ignorable")) {
                rbc.setAlternateHandlingShifted(false);
            } else {
                throw new IllegalArgumentException("Invalid alternate: " + alternate);
            }
        }

        Boolean caseLevel = settings.getAsBoolean("caseLevel", null);
        if (caseLevel != null) {
            rbc.setCaseLevel(caseLevel);
        }

        String caseFirst = settings.get("caseFirst");
        if (caseFirst != null) {
            if (caseFirst.equalsIgnoreCase("lower")) {
                rbc.setLowerCaseFirst(true);
            } else if (caseFirst.equalsIgnoreCase("upper")) {
                rbc.setUpperCaseFirst(true);
            } else {
                throw new IllegalArgumentException("Invalid caseFirst: " + caseFirst);
            }
        }

        Boolean numeric = settings.getAsBoolean("numeric", null);
        if (numeric != null) {
            rbc.setNumericCollation(numeric);
        }

        String variableTop = settings.get("variableTop");
        if (variableTop != null) {
            rbc.setVariableTop(variableTop);
        }

        Boolean hiraganaQuaternaryMode = settings.getAsBoolean("hiraganaQuaternaryMode", null);
        if (hiraganaQuaternaryMode != null) {
            rbc.setHiraganaQuaternary(hiraganaQuaternaryMode);
        }

        this.collator = collator;
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new ICUCollationKeyFilter(tokenStream, collator);
    }
}

View File

@ -0,0 +1,72 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;
/**
* Uses the {@link org.apache.lucene.analysis.icu.ICUFoldingFilter}.
* Applies foldings from UTR#30 Character Foldings.
* <p>
* Can be filtered to handle certain characters in a specified way (see http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html)
* E.g. national chars that should be retained (filter : "[^åäöÅÄÖ]").
*
* <p>The <tt>unicodeSetFilter</tt> attribute can be used to provide the UniCodeSet for filtering.
*
* @author kimchy (shay.banon)
*/
public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory {

    private final String unicodeSetFilter;

    @Inject
    public IcuFoldingTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        this.unicodeSetFilter = settings.get("unicodeSetFilter");
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // The ICUFoldingFilter is in fact implemented as an ICUNormalizer2Filter.
        // ICUFoldingFilter lacks a constructor for adding filtering so we implement it here.
        if (unicodeSetFilter != null) {
            Normalizer2 base = Normalizer2.getInstance(
                    ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
                    "utr30", Normalizer2.Mode.COMPOSE);
            UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter);
            unicodeSet.freeze();
            Normalizer2 filtered = new FilteredNormalizer2(base, unicodeSet);
            return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, filtered);
        } else {
            return new ICUFoldingFilter(tokenStream);
        }
    }
}

View File

@ -0,0 +1,63 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Normalizer2;
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.io.Reader;
/**
* Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter} to normalize characters.
* <p/>
* <p>The <tt>name</tt> can be used to provide the type of normalization to perform.</p>
* <p>The <tt>mode</tt> can be used to provide 'compose' or 'decompose'. Default is compose.</p>
*/
public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory {

    private final String name;

    private final Normalizer2 normalizer;

    @Inject
    public IcuNormalizerCharFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name);
        this.name = settings.get("name", "nfkc_cf");
        String mode = settings.get("mode");
        if (!"compose".equals(mode) && !"decompose".equals(mode)) {
            mode = "compose";
        }
        this.normalizer = Normalizer2.getInstance(
                null, this.name, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
    }

    @Override
    public Reader create(Reader reader) {
        return new ICUNormalizer2CharFilter(reader, normalizer);
    }
}

View File

@ -0,0 +1,52 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Normalizer2;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
/**
 * Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to normalize tokens.
 * <p/>
 * <p>The <tt>name</tt> can be used to provide the type of normalization to perform.
 */
public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory {

    private final String name;

    @Inject
    public IcuNormalizerTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        this.name = settings.get("name", "nfkc_cf");
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE));
    }
}

View File

@ -0,0 +1,46 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.io.Reader;
/**
*/
public class IcuTokenizerFactory extends AbstractTokenizerFactory {

    @Inject
    public IcuTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
    }

    @Override
    public Tokenizer create() {
        return new ICUTokenizer();
    }
}

View File

@ -0,0 +1,53 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Transliterator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.icu.ICUTransformFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
/**
*/
public class IcuTransformTokenFilterFactory extends AbstractTokenFilterFactory {

    private final String id;
    private final int dir;
    private final Transliterator transliterator;

    @Inject
    public IcuTransformTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        this.id = settings.get("id", "Null");
        String s = settings.get("dir", "forward");
        this.dir = "forward".equals(s) ? Transliterator.FORWARD : Transliterator.REVERSE;
        this.transliterator = Transliterator.getInstance(id, dir);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new ICUTransformFilter(tokenStream, transliterator);
    }
}

View File

@ -0,0 +1,241 @@
package org.elasticsearch.index.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadoc
/**
* Provides support for converting byte sequences to Strings and back again.
* The resulting Strings preserve the original byte sequences' sort order.
* <p/>
* The Strings are constructed using a Base 8000h encoding of the original
* binary data - each char of an encoded String represents a 15-bit chunk
* from the byte sequence. Base 8000h was chosen because it allows for all
* lower 15 bits of char to be used without restriction; the surrogate range
* [U+D800-U+DFFF] does not represent valid chars, and would require
* complicated handling to avoid them and allow use of char's high bit.
* <p/>
* Although unset bits are used as padding in the final char, the original
* byte sequence could contain trailing bytes with no set bits (null bytes):
* padding is indistinguishable from valid information. To overcome this
* problem, a char is appended, indicating the number of encoded bytes in the
* final content char.
* <p/>
*
* @lucene.experimental
* @deprecated Implement {@link TermToBytesRefAttribute} and store bytes directly
* instead. This class WAS removed in Lucene 5.0
*/
@Deprecated
public final class IndexableBinaryStringTools {

  private static final CodingCase[] CODING_CASES = {
    // CodingCase(int initialShift, int finalShift)
    new CodingCase( 7, 1 ),
    // CodingCase(int initialShift, int middleShift, int finalShift)
    new CodingCase(14, 6, 2),
    new CodingCase(13, 5, 3),
    new CodingCase(12, 4, 4),
    new CodingCase(11, 3, 5),
    new CodingCase(10, 2, 6),
    new CodingCase( 9, 1, 7),
    new CodingCase( 8, 0 )
  };

  // Export only static methods
  private IndexableBinaryStringTools() {}

  /**
   * Returns the number of chars required to encode the given bytes.
   *
   * @param inputArray byte sequence to be encoded
   * @param inputOffset initial offset into inputArray
   * @param inputLength number of bytes in inputArray
   * @return The number of chars required to encode the number of bytes.
   */
  public static int getEncodedLength(byte[] inputArray, int inputOffset,
      int inputLength) {
    // Use long for intermediaries to protect against overflow.
    // ceil(8 * inputLength / 15) chars hold the 15-bit chunks, plus one
    // trailing char recording the number of full bytes in the final char.
    return (int) ((8L * inputLength + 14L) / 15L) + 1;
  }

  /**
   * Returns the number of bytes required to decode the given char sequence.
   *
   * @param encoded char sequence to be decoded
   * @param offset initial offset
   * @param length number of characters
   * @return The number of bytes required to decode the given char sequence
   */
  public static int getDecodedLength(char[] encoded, int offset, int length) {
    final int numChars = length - 1;
    if (numChars <= 0) {
      return 0;
    } else {
      // Use long for intermediaries to protect against overflow
      final long numFullBytesInFinalChar = encoded[offset + length - 1];
      final long numEncodedChars = numChars - 1;
      return (int) ((numEncodedChars * 15L + 7L) / 8L + numFullBytesInFinalChar);
    }
  }

  /**
   * Encodes the input byte sequence into the output char sequence. Before
   * calling this method, ensure that the output array has sufficient
   * capacity by calling {@link #getEncodedLength(byte[], int, int)}.
   *
   * @param inputArray byte sequence to be encoded
   * @param inputOffset initial offset into inputArray
   * @param inputLength number of bytes in inputArray
   * @param outputArray char sequence to store encoded result
   * @param outputOffset initial offset into outputArray
   * @param outputLength length of output, must be getEncodedLength
   */
  public static void encode(byte[] inputArray, int inputOffset,
      int inputLength, char[] outputArray, int outputOffset, int outputLength) {
    assert (outputLength == getEncodedLength(inputArray, inputOffset,
        inputLength));
    if (inputLength > 0) {
      int inputByteNum = inputOffset;
      int caseNum = 0;
      int outputCharNum = outputOffset;
      CodingCase codingCase;
      for (; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength; ++outputCharNum) {
        codingCase = CODING_CASES[caseNum];
        if (2 == codingCase.numBytes) {
          outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
              + (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF);
        } else { // numBytes is 3
          outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
              + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)
              + (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF);
        }
        inputByteNum += codingCase.advanceBytes;
        if (++caseNum == CODING_CASES.length) {
          caseNum = 0;
        }
      }
      // Produce final char (if any) and trailing count chars.
      codingCase = CODING_CASES[caseNum];
      if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3
        outputArray[outputCharNum++] = (char) ((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) & (short) 0x7FFF);
        // Add trailing char containing the number of full bytes in final char
        outputArray[outputCharNum++] = (char) 1;
      } else if (inputByteNum < inputLength) {
        outputArray[outputCharNum++] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) & (short) 0x7FFF);
        // Add trailing char containing the number of full bytes in final char
        outputArray[outputCharNum++] = caseNum == 0 ? (char) 1 : (char) 0;
      } else { // No left over bits - last char is completely filled.
        // Add trailing char containing the number of full bytes in final char
        outputArray[outputCharNum++] = (char) 1;
      }
    }
  }

  /**
   * Decodes the input char sequence into the output byte sequence. Before
   * calling this method, ensure that the output array has sufficient capacity
   * by calling {@link #getDecodedLength(char[], int, int)}.
   *
   * @param inputArray char sequence to be decoded
   * @param inputOffset initial offset into inputArray
   * @param inputLength number of chars in inputArray
   * @param outputArray byte sequence to store encoded result
   * @param outputOffset initial offset into outputArray
   * @param outputLength length of output, must be
   *        getDecodedLength(inputArray, inputOffset, inputLength)
   */
  public static void decode(char[] inputArray, int inputOffset,
      int inputLength, byte[] outputArray, int outputOffset, int outputLength) {
    assert (outputLength == getDecodedLength(inputArray, inputOffset,
        inputLength));
    final int numInputChars = inputLength - 1;
    final int numOutputBytes = outputLength;
    if (numOutputBytes > 0) {
      int caseNum = 0;
      int outputByteNum = outputOffset;
      int inputCharNum = inputOffset;
      short inputChar;
      CodingCase codingCase;
      for (; inputCharNum < numInputChars - 1; ++inputCharNum) {
        codingCase = CODING_CASES[caseNum];
        inputChar = (short) inputArray[inputCharNum];
        if (2 == codingCase.numBytes) {
          if (0 == caseNum) {
            outputArray[outputByteNum] = (byte) (inputChar >>> codingCase.initialShift);
          } else {
            outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
          }
          outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
        } else { // numBytes is 3
          outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
          outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
          outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
        }
        outputByteNum += codingCase.advanceBytes;
        if (++caseNum == CODING_CASES.length) {
          caseNum = 0;
        }
      }
      // Handle final char
      inputChar = (short) inputArray[inputCharNum];
      codingCase = CODING_CASES[caseNum];
      if (0 == caseNum) {
        outputArray[outputByteNum] = 0;
      }
      outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
      final int bytesLeft = numOutputBytes - outputByteNum;
      if (bytesLeft > 1) {
        if (2 == codingCase.numBytes) {
          outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) >>> codingCase.finalShift);
        } else { // numBytes is 3
          outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
          if (bytesLeft > 2) {
            outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
          }
        }
      }
    }
  }

  static class CodingCase {
    int numBytes, initialShift, middleShift, finalShift, advanceBytes = 2;
    short middleMask, finalMask;

    CodingCase(int initialShift, int middleShift, int finalShift) {
      this.numBytes = 3;
      this.initialShift = initialShift;
      this.middleShift = middleShift;
      this.finalShift = finalShift;
      this.finalMask = (short) ((short) 0xFF >>> finalShift);
      this.middleMask = (short) ((short) 0xFF << middleShift);
    }

    CodingCase(int initialShift, int finalShift) {
      this.numBytes = 2;
      this.initialShift = initialShift;
      this.finalShift = finalShift;
      this.finalMask = (short) ((short) 0xFF >>> finalShift);
      if (finalShift != 0) {
        advanceBytes = 1;
      }
    }
  }
}

View File

@ -0,0 +1,110 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.indices.analysis;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.Transliterator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
import org.apache.lucene.analysis.icu.ICUTransformFilter;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.analysis.ICUCollationKeyFilter;
import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory;
import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
/**
* Registers indices-level analysis components so that, if not explicitly configured, they will be
* shared among all indices.
*/
public class IcuIndicesAnalysis extends AbstractComponent {

    @Inject
    public IcuIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) {
        super(settings);

        indicesAnalysisService.tokenizerFactories().put("icu_tokenizer", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
            @Override
            public String name() {
                return "icu_tokenizer";
            }

            @Override
            public Tokenizer create() {
                return new ICUTokenizer();
            }
        }));

        indicesAnalysisService.tokenFilterFactories().put("icu_normalizer", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
            @Override
            public String name() {
                return "icu_normalizer";
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
            }
        }));

        indicesAnalysisService.tokenFilterFactories().put("icu_folding", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
            @Override
            public String name() {
                return "icu_folding";
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new ICUFoldingFilter(tokenStream);
            }
        }));

        indicesAnalysisService.tokenFilterFactories().put("icu_collation", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
            @Override
            public String name() {
                return "icu_collation";
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new ICUCollationKeyFilter(tokenStream, Collator.getInstance());
            }
        }));

        indicesAnalysisService.tokenFilterFactories().put("icu_transform", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
            @Override
            public String name() {
                return "icu_transform";
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new ICUTransformFilter(tokenStream, Transliterator.getInstance("Null", Transliterator.FORWARD));
            }
        }));
    }
}

View File

@ -0,0 +1,32 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.indices.analysis;
import org.elasticsearch.common.inject.AbstractModule;
/**
*/
public class IcuIndicesAnalysisModule extends AbstractModule {

    @Override
    protected void configure() {
        bind(IcuIndicesAnalysis.class).asEagerSingleton();
    }
}

View File

@ -0,0 +1,59 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.plugin.analysis.icu;
import org.elasticsearch.common.inject.Module;
import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.index.analysis.IcuAnalysisBinderProcessor;
import org.elasticsearch.indices.analysis.IcuIndicesAnalysisModule;
import org.elasticsearch.plugins.AbstractPlugin;
import java.util.ArrayList;
import java.util.Collection;
/**
*
*/
public class AnalysisICUPlugin extends AbstractPlugin {

    @Override
    public String name() {
        return "analysis-icu";
    }

    @Override
    public String description() {
        return "UTF related ICU analysis support";
    }

    @Override
    public Collection<Class<? extends Module>> modules() {
        Collection<Class<? extends Module>> classes = new ArrayList<>();
        classes.add(IcuIndicesAnalysisModule.class);
        return classes;
    }

    /**
     * Automatically called with the analysis module.
     */
    public void onModule(AnalysisModule module) {
        module.addProcessor(new IcuAnalysisBinderProcessor());
    }
}

View File

@ -0,0 +1,3 @@
plugin=org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin
version=${project.version}
lucene=${lucene.version}

View File

@ -0,0 +1,54 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import static org.elasticsearch.common.settings.Settings.settingsBuilder;
public class AnalysisTestUtils {

    public static AnalysisService createAnalysisService(Settings settings) {
        Index index = new Index("test");
        Settings indexSettings = settingsBuilder().put(settings)
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .build();
        Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
        Injector injector = new ModulesBuilder().add(
                new IndexSettingsModule(index, indexSettings),
                new IndexNameModule(index),
                new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor()))
                .createChildInjector(parentInjector);

        return injector.getInstance(AnalysisService.class);
    }
}

View File

@ -0,0 +1,119 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.plugins.PluginsService;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.junit.Test;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.notNullValue;
@ElasticsearchIntegrationTest.ClusterScope(scope = ElasticsearchIntegrationTest.Scope.SUITE)
public class ICUIntegrationTests extends ElasticsearchIntegrationTest {

    @Override
    protected Settings nodeSettings(int nodeOrdinal) {
        return Settings.builder()
                .put(super.nodeSettings(nodeOrdinal))
                .put("plugins." + PluginsService.LOAD_PLUGIN_FROM_CLASSPATH, true)
                .build();
    }

    @Override
    public Settings indexSettings() {
        Settings settings = Settings.builder()
                .put(super.indexSettings())
                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
                .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_collator")
                .put("index.analysis.filter.my_collator.type", "icu_collation")
                .put("index.analysis.filter.my_collator.language", "en")
                .put("index.analysis.filter.my_collator.strength", "primary")
                .build();

        return settings;
    }

    @Test
    public void testICUAnalyzer() throws ExecutionException, InterruptedException {
        createIndex("test");
        ensureGreen("test");
        AnalyzeResponse response1 = client().admin().indices()
                .prepareAnalyze("Bâton enflammé")
                .setIndex("test")
                .setAnalyzer("my_analyzer")
                .execute().get();
        AnalyzeResponse response2 = client().admin().indices()
                .prepareAnalyze("baton enflamme")
                .setIndex("test")
                .setAnalyzer("my_analyzer")
                .execute().get();

        assertThat(response1, notNullValue());
        assertThat(response2, notNullValue());
        assertThat(response1.getTokens().size(), is(response2.getTokens().size()));
        for (int i = 0; i < response2.getTokens().size(); i++) {
            assertThat(response1.getTokens().get(i).getTerm(), is(response2.getTokens().get(i).getTerm()));
        }
    }

    @Test
    public void testICUAnalyzerInMapping() throws ExecutionException, InterruptedException, IOException {
        createIndex("test");
        ensureGreen("test");
        final XContentBuilder mapping = jsonBuilder().startObject()
                .startObject("type")
                    .startObject("properties")
                        .startObject("foo")
                            .field("type", "string")
                            .field("analyzer", "my_analyzer")
                        .endObject()
                    .endObject()
                .endObject()
            .endObject();

        client().admin().indices().preparePutMapping("test").setType("type").setSource(mapping).get();

        index("test", "type", "1", "foo", "Bâton enflammé");
        refresh();

        SearchResponse response = client().prepareSearch("test").setQuery(
                QueryBuilders.matchQuery("foo", "baton enflamme")
        ).execute().actionGet();

        assertThat(response.getHits().getTotalHits(), is(1L));
    }

    @Test
    public void testPluginIsLoaded() {
        NodesInfoResponse infos = client().admin().cluster().prepareNodesInfo().setPlugins(true).execute().actionGet();
        assertThat(infos.getNodes()[0].getPlugins().getInfos().get(0).getName(), is("analysis-icu"));
    }
}

View File

@@ -0,0 +1,58 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
import static org.elasticsearch.common.settings.Settings.settingsBuilder;
import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService;
import static org.hamcrest.Matchers.instanceOf;
/**
* Checks that the default ICU analysis components are registered under their expected names.
*/
public class SimpleIcuAnalysisTests extends ElasticsearchTestCase {
@Test
public void testDefaultsIcuAnalysis() {
// Defaults need no extra analysis settings; the "phonetic-1.yml" classpath resource
// previously loaded here appears to be a leftover from another plugin's tests.
Settings settings = settingsBuilder()
.put("path.home", createTempDir())
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer");
assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));
TokenFilterFactory filterFactory = analysisService.tokenFilter("icu_normalizer");
assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));
filterFactory = analysisService.tokenFilter("icu_folding");
assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));
filterFactory = analysisService.tokenFilter("icu_collation");
assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));
filterFactory = analysisService.tokenFilter("icu_transform");
assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
CharFilterFactory charFilterFactory = analysisService.charFilter("icu_normalizer");
assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class));
}
}

View File

@@ -0,0 +1,263 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
import java.io.IOException;
import java.io.StringReader;
import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService;
import static org.hamcrest.Matchers.equalTo;
// Tests borrowed from Solr's ICU collation key filter factory test.
public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase {
/*
* Turkish has some funny casing.
* This test shows how you can solve this kind of thing easily with collation.
* Instead of using LowerCaseFilter, use a Turkish collator with primary strength.
* Then things will sort and match correctly.
*/
@Test
public void testBasicUsage() throws Exception {
Settings settings = Settings.settingsBuilder()
.put("path.home", createTempDir())
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "tr")
.put("index.analysis.filter.myCollator.strength", "primary")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "I WİLL USE TURKİSH CASING", "ı will use turkish casıng");
}
/*
* Test usage of the decomposition option for Unicode normalization.
*/
@Test
public void testNormalization() throws IOException {
Settings settings = Settings.settingsBuilder()
.put("path.home", createTempDir())
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "tr")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.decomposition", "canonical")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "I W\u0049\u0307LL USE TURKİSH CASING", "ı will use turkish casıng");
}
/*
* Test secondary strength; for English, case is not significant.
*/
@Test
public void testSecondaryStrength() throws IOException {
Settings settings = Settings.settingsBuilder()
.put("path.home", createTempDir())
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "secondary")
.put("index.analysis.filter.myCollator.decomposition", "no")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "TESTING", "testing");
}
/*
* Setting alternate=shifted to shift whitespace, punctuation and symbols
* to quaternary level
*/
@Test
public void testIgnorePunctuation() throws IOException {
Settings settings = Settings.settingsBuilder()
.put("path.home", createTempDir())
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.alternate", "shifted")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "foo-bar", "foo bar");
}
/*
* Setting alternate=shifted and variableTop to shift whitespace, but not
* punctuation or symbols, to quaternary level
*/
@Test
public void testIgnoreWhitespace() throws IOException {
Settings settings = Settings.settingsBuilder()
.put("path.home", createTempDir())
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.alternate", "shifted")
.put("index.analysis.filter.myCollator.variableTop", " ")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "foo bar", "foobar");
// now assert that punctuation still matters: foo-bar < foo bar
assertCollation(filterFactory, "foo-bar", "foo bar", -1);
}
/*
* Setting numeric to encode digits with numeric value, so that
* foobar-9 sorts before foobar-10
*/
@Test
public void testNumerics() throws IOException {
Settings settings = Settings.settingsBuilder()
.put("path.home", createTempDir())
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.numeric", "true")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollation(filterFactory, "foobar-9", "foobar-10", -1);
}
/*
* Setting caseLevel=true to create an additional case level between
* secondary and tertiary
*/
@Test
public void testIgnoreAccentsButNotCase() throws IOException {
Settings settings = Settings.settingsBuilder()
.put("path.home", createTempDir())
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.caseLevel", "true")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "résumé", "resume");
assertCollatesToSame(filterFactory, "Résumé", "Resume");
// now assert that case still matters: resume < Resume
assertCollation(filterFactory, "resume", "Resume", -1);
}
/*
* Setting caseFirst=upper to cause uppercase strings to sort
* before lowercase ones.
*/
@Test
public void testUpperCaseFirst() throws IOException {
Settings settings = Settings.settingsBuilder()
.put("path.home", createTempDir())
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "tertiary")
.put("index.analysis.filter.myCollator.caseFirst", "upper")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollation(filterFactory, "Resume", "resume", -1);
}
/*
* For German, you might want oe to sort and match with o umlaut.
* This is not the default, but you can make a customized ruleset to do this.
*
* The default is DIN 5007-1; this shows how to tailor a collator to get DIN 5007-2 behavior.
* http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
*/
@Test
public void testCustomRules() throws Exception {
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
String DIN5007_2_tailorings =
"& ae , a\u0308 & AE , A\u0308"+
"& oe , o\u0308 & OE , O\u0308"+
"& ue , u\u0308 & UE , u\u0308";
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
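// Serialize the tailored collator back to a rules string so it can be passed through index settings.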
String tailoredRules = tailoredCollator.getRules();
Settings settings = Settings.settingsBuilder()
.put("path.home", createTempDir())
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.rules", tailoredRules)
.put("index.analysis.filter.myCollator.strength", "primary")
.build();
AnalysisService analysisService = createAnalysisService(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
assertCollatesToSame(filterFactory, "Töne", "Toene");
}
private void assertCollatesToSame(TokenFilterFactory factory, String string1, String string2) throws IOException {
assertCollation(factory, string1, string2, 0);
}
private void assertCollation(TokenFilterFactory factory, String string1, String string2, int comparison) throws IOException {
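// KeywordTokenizer emits the whole input as a single token, so each stream below yields exactly one collation key.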
Tokenizer tokenizer = new KeywordTokenizer();
tokenizer.setReader(new StringReader(string1));
TokenStream stream1 = factory.create(tokenizer);
tokenizer = new KeywordTokenizer();
tokenizer.setReader(new StringReader(string2));
TokenStream stream2 = factory.create(tokenizer);
assertCollation(stream1, stream2, comparison);
}
private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
stream1.reset();
stream2.reset();
assertThat(stream1.incrementToken(), equalTo(true));
assertThat(stream2.incrementToken(), equalTo(true));
assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison)));
assertThat(stream1.incrementToken(), equalTo(false));
assertThat(stream2.incrementToken(), equalTo(false));
stream1.end();
stream2.end();
stream1.close();
stream2.close();
}
}

View File

@@ -0,0 +1,89 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Normalizer2;
import org.apache.lucene.analysis.CharFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
import java.io.StringReader;
import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService;
/**
* Tests for the icu_normalizer char filter.
*/
public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase {
@Test
public void testDefaultSetting() throws Exception {
Settings settings = Settings.settingsBuilder()
.put("path.home", createTempDir())
.put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
.build();
AnalysisService analysisService = createAnalysisService(settings);
CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar");
String input = "ʰ㌰゙5℃№㈱㌘バッファーの正規化のテスト㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
String expectedOutput = normalizer.normalize(input);
CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input));
char[] tempBuff = new char[10];
StringBuilder output = new StringBuilder();
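// Read through the char filter in small chunks, checking after every read that the
// output so far matches the normalized form of the corrected-offset input prefix.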
while (true) {
int length = inputReader.read(tempBuff);
if (length == -1) break;
output.append(tempBuff, 0, length);
assertEquals(output.toString(), normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length()))));
}
assertEquals(expectedOutput, output.toString());
}
@Test
public void testNameAndModeSetting() throws Exception {
Settings settings = Settings.settingsBuilder()
.put("path.home", createTempDir())
.put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
.put("index.analysis.char_filter.myNormalizerChar.name", "nfkc")
.put("index.analysis.char_filter.myNormalizerChar.mode", "decompose")
.build();
AnalysisService analysisService = createAnalysisService(settings);
CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar");
String input = "ʰ㌰゙5℃№㈱㌘バッファーの正規化のテスト㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
String expectedOutput = normalizer.normalize(input);
CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input));
char[] tempBuff = new char[10];
StringBuilder output = new StringBuilder();
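// Same incremental read-and-compare as above, against the nfkc/decompose normalizer.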
while (true) {
int length = inputReader.read(tempBuff);
if (length == -1) break;
output.append(tempBuff, 0, length);
assertEquals(output.toString(), normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length()))));
}
assertEquals(expectedOutput, output.toString());
}
}

View File

@@ -0,0 +1,247 @@
package org.elasticsearch.index.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.carrotsearch.randomizedtesting.annotations.Listeners;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.TimeUnits;
import org.elasticsearch.test.junit.listeners.ReproduceInfoPrinter;
import org.junit.BeforeClass;
import java.util.Locale;
/**
* Tests for IndexableBinaryStringTools, which packs arbitrary bytes into valid,
* sort-order-preserving UTF-16 text so that binary terms can be indexed.
*
* @deprecated Remove when IndexableBinaryStringTools is removed.
*/
@Deprecated
@Listeners({
ReproduceInfoPrinter.class
})
@ThreadLeakScope(Scope.NONE)
@TimeoutSuite(millis = TimeUnits.HOUR)
@LuceneTestCase.SuppressSysoutChecks(bugUrl = "we log a lot on purpose")
public class TestIndexableBinaryStringTools extends LuceneTestCase {
private static int NUM_RANDOM_TESTS;
private static int MAX_RANDOM_BINARY_LENGTH;
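// Both sizes are randomized in beforeClass(); atLeast(n) scales with the test size multiplier.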
@BeforeClass
public static void beforeClass() throws Exception {
NUM_RANDOM_TESTS = atLeast(200);
MAX_RANDOM_BINARY_LENGTH = atLeast(300);
}
public void testSingleBinaryRoundTrip() {
byte[] binary = new byte[] { (byte) 0x23, (byte) 0x98, (byte) 0x13,
(byte) 0xE4, (byte) 0x76, (byte) 0x41, (byte) 0xB2, (byte) 0xC9,
(byte) 0x7F, (byte) 0x0A, (byte) 0xA6, (byte) 0xD8 };
int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
binary.length);
char[] encoded = new char[encodedLen];
IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0,
encoded.length);
int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
encoded.length);
byte[] decoded = new byte[decodedLen];
IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0,
decoded.length);
assertEquals("Round trip decode/decode returned different results:"
+ System.getProperty("line.separator") + "original: "
+ binaryDump(binary, binary.length)
+ System.getProperty("line.separator") + " encoded: "
+ charArrayDump(encoded, encoded.length)
+ System.getProperty("line.separator") + " decoded: "
+ binaryDump(decoded, decoded.length),
binaryDump(binary, binary.length), binaryDump(decoded, decoded.length));
}
public void testEncodedSortability() {
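// The encoding must preserve sort order: comparing two encoded strings must give the
// same sign as comparing the original bytes interpreted as chars.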
byte[] originalArray1 = new byte[MAX_RANDOM_BINARY_LENGTH];
char[] originalString1 = new char[MAX_RANDOM_BINARY_LENGTH];
char[] encoded1 = new char[MAX_RANDOM_BINARY_LENGTH * 10];
byte[] originalArray2 = new byte[MAX_RANDOM_BINARY_LENGTH];
char[] originalString2 = new char[MAX_RANDOM_BINARY_LENGTH];
char[] encoded2 = new char[MAX_RANDOM_BINARY_LENGTH * 10];
for (int testNum = 0; testNum < NUM_RANDOM_TESTS; ++testNum) {
int numBytes1 = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1
for (int byteNum = 0; byteNum < numBytes1; ++byteNum) {
int randomInt = random().nextInt(0x100);
originalArray1[byteNum] = (byte) randomInt;
originalString1[byteNum] = (char) randomInt;
}
int numBytes2 = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1
for (int byteNum = 0; byteNum < numBytes2; ++byteNum) {
int randomInt = random().nextInt(0x100);
originalArray2[byteNum] = (byte) randomInt;
originalString2[byteNum] = (char) randomInt;
}
int originalComparison = new String(originalString1, 0, numBytes1)
.compareTo(new String(originalString2, 0, numBytes2));
originalComparison = originalComparison < 0 ? -1
: originalComparison > 0 ? 1 : 0;
int encodedLen1 = IndexableBinaryStringTools.getEncodedLength(
originalArray1, 0, numBytes1);
if (encodedLen1 > encoded1.length)
encoded1 = new char[ArrayUtil.oversize(encodedLen1, RamUsageEstimator.NUM_BYTES_CHAR)];
IndexableBinaryStringTools.encode(originalArray1, 0, numBytes1, encoded1,
0, encodedLen1);
int encodedLen2 = IndexableBinaryStringTools.getEncodedLength(originalArray2,
0, numBytes2);
if (encodedLen2 > encoded2.length)
encoded2 = new char[ArrayUtil.oversize(encodedLen2, RamUsageEstimator.NUM_BYTES_CHAR)];
IndexableBinaryStringTools.encode(originalArray2, 0, numBytes2, encoded2, 0,
encodedLen2);
int encodedComparison = new String(encoded1, 0, encodedLen1)
.compareTo(new String(encoded2, 0, encodedLen2));
encodedComparison = encodedComparison < 0 ? -1
: encodedComparison > 0 ? 1 : 0;
assertEquals("Test #" + (testNum + 1)
+ ": Original bytes and encoded chars compare differently:"
+ System.getProperty("line.separator") + " binary 1: "
+ binaryDump(originalArray1, numBytes1)
+ System.getProperty("line.separator") + " binary 2: "
+ binaryDump(originalArray2, numBytes2)
+ System.getProperty("line.separator") + "encoded 1: "
+ charArrayDump(encoded1, encodedLen1)
+ System.getProperty("line.separator") + "encoded 2: "
+ charArrayDump(encoded2, encodedLen2)
+ System.getProperty("line.separator"), originalComparison,
encodedComparison);
}
}
public void testEmptyInput() {
byte[] binary = new byte[0];
int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
binary.length);
char[] encoded = new char[encodedLen];
IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0,
encoded.length);
int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
encoded.length);
byte[] decoded = new byte[decodedLen];
IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0,
decoded.length);
assertEquals("decoded empty input was not empty", decoded.length, 0);
}
public void testAllNullInput() {
byte[] binary = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
binary.length);
char[] encoded = new char[encodedLen];
IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0,
encoded.length);
int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
encoded.length);
byte[] decoded = new byte[decodedLen];
IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0,
decoded.length);
assertEquals("Round trip decode/decode returned different results:"
+ System.getProperty("line.separator") + " original: "
+ binaryDump(binary, binary.length)
+ System.getProperty("line.separator") + "decodedBuf: "
+ binaryDump(decoded, decoded.length),
binaryDump(binary, binary.length), binaryDump(decoded, decoded.length));
}
public void testRandomBinaryRoundTrip() {
byte[] binary = new byte[MAX_RANDOM_BINARY_LENGTH];
char[] encoded = new char[MAX_RANDOM_BINARY_LENGTH * 10];
byte[] decoded = new byte[MAX_RANDOM_BINARY_LENGTH];
for (int testNum = 0; testNum < NUM_RANDOM_TESTS; ++testNum) {
int numBytes = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1
for (int byteNum = 0; byteNum < numBytes; ++byteNum) {
binary[byteNum] = (byte) random().nextInt(0x100);
}
int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0,
numBytes);
if (encoded.length < encodedLen)
encoded = new char[ArrayUtil.oversize(encodedLen, RamUsageEstimator.NUM_BYTES_CHAR)];
IndexableBinaryStringTools.encode(binary, 0, numBytes, encoded, 0,
encodedLen);
int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0,
encodedLen);
IndexableBinaryStringTools.decode(encoded, 0, encodedLen, decoded, 0,
decodedLen);
assertEquals("Test #" + (testNum + 1)
+ ": Round trip decode/decode returned different results:"
+ System.getProperty("line.separator") + " original: "
+ binaryDump(binary, numBytes) + System.getProperty("line.separator")
+ "encodedBuf: " + charArrayDump(encoded, encodedLen)
+ System.getProperty("line.separator") + "decodedBuf: "
+ binaryDump(decoded, decodedLen), binaryDump(binary, numBytes),
binaryDump(decoded, decodedLen));
}
}
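// Hex-dump helpers used to render readable failure messages.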
public String binaryDump(byte[] binary, int numBytes) {
StringBuilder buf = new StringBuilder();
for (int byteNum = 0 ; byteNum < numBytes ; ++byteNum) {
String hex = Integer.toHexString(binary[byteNum] & 0xFF);
if (hex.length() == 1) {
buf.append('0');
}
buf.append(hex.toUpperCase(Locale.ROOT));
if (byteNum < numBytes - 1) {
buf.append(' ');
}
}
return buf.toString();
}
public String charArrayDump(char[] charArray, int numChars) {
StringBuilder buf = new StringBuilder();
for (int charNum = 0 ; charNum < numChars ; ++charNum) {
String hex = Integer.toHexString(charArray[charNum]);
for (int digit = 0 ; digit < 4 - hex.length() ; ++digit) {
buf.append('0');
}
buf.append(hex.toUpperCase(Locale.ROOT));
if (charNum < numChars - 1) {
buf.append(' ');
}
}
return buf.toString();
}
}