From 5c6b4f4f65026ecf8dbe3a5390966262178d6b18 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 2 Nov 2010 12:03:18 +0000 Subject: [PATCH] SOLR-2210: add factories for icu analyzers git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1030012 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 2 + solr/build.xml | 13 +- solr/common-build.xml | 3 + solr/contrib/analysis-extras/README.txt | 16 ++ solr/contrib/analysis-extras/build.xml | 203 ++++++++++++++++++ .../analysis-extras/lib/icu4j-4_4_2.jar | 2 + .../ICUCollationKeyFilterFactory.java | 142 ++++++++++++ .../analysis/ICUFoldingFilterFactory.java | 30 +++ .../analysis/ICUNormalizer2FilterFactory.java | 81 +++++++ .../solr/analysis/ICUTokenizerFactory.java | 32 +++ .../analysis/ICUTransformFilterFactory.java | 67 ++++++ .../TestICUCollationKeyFilterFactory.java | 170 +++++++++++++++ .../analysis/TestICUFoldingFilterFactory.java | 39 ++++ .../TestICUNormalizer2FilterFactory.java | 41 ++++ .../analysis/TestICUTokenizerFactory.java | 35 +++ .../TestICUTransformFilterFactory.java | 64 ++++++ 16 files changed, 936 insertions(+), 4 deletions(-) create mode 100644 solr/contrib/analysis-extras/README.txt create mode 100644 solr/contrib/analysis-extras/build.xml create mode 100644 solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar create mode 100644 solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java create mode 100644 solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java create mode 100644 solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 4c69968943b..d5b6701de7d 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -297,6 +297,8 @@ New Features built-in load balancing, and infrastructure for future SolrCloud work. 
(yonik, Mark Miller) +* SOLR-2210: Add icu-based tokenizer and filters to contrib/analysis-extras (rmuir) + Optimizations ---------------------- diff --git a/solr/build.xml b/solr/build.xml index b1798cb80f7..81e66356c10 100644 --- a/solr/build.xml +++ b/solr/build.xml @@ -34,9 +34,6 @@ - - - + @@ -509,6 +507,7 @@ + @@ -609,6 +608,8 @@ basedir="contrib/extraction/src" /> + --> + @@ -721,7 +724,7 @@ + excludes="lib/README.committers.txt **/data/ **/logs/* **/classes/ **/*.sh **/bin/ src/scripts/ src/site/build/ **/target/ client/ruby/flare/ client/python contrib/**/build/ **/*.iml **/*.ipr **/*.iws contrib/clustering/example/lib/** contrib/clustering/lib/downloads/** contrib/analysis-extras/lib/**" /> @@ -952,6 +955,8 @@ + + diff --git a/solr/common-build.xml b/solr/common-build.xml index de0aacae36f..d95925c2d26 100644 --- a/solr/common-build.xml +++ b/solr/common-build.xml @@ -23,6 +23,9 @@ + + + diff --git a/solr/contrib/analysis-extras/README.txt b/solr/contrib/analysis-extras/README.txt new file mode 100644 index 00000000000..2c60e0e7917 --- /dev/null +++ b/solr/contrib/analysis-extras/README.txt @@ -0,0 +1,16 @@ +The analysis-extras plugin provides additional analyzers that rely +upon large dependencies/dictionaries. + +It includes integration with ICU for multilingual support, and +analyzers for Chinese and Polish. + +Relies upon the following lucene components (in lucene-libs/): + + * lucene-analyzers-icu-X.Y.jar + * lucene-analyzers-smartcn-X.Y.jar + * lucene-analyzers-stempel-X.Y.jar + +And the ICU library (in lib/): + + * icu4j-X.Y.jar + \ No newline at end of file diff --git a/solr/contrib/analysis-extras/build.xml b/solr/contrib/analysis-extras/build.xml new file mode 100644 index 00000000000..af4a13da4b2 --- /dev/null +++ b/solr/contrib/analysis-extras/build.xml @@ -0,0 +1,203 @@ + + + + + + + + + + + + Additional analysis components + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tests failed! + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar b/solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar new file mode 100644 index 00000000000..3120f680cb3 --- /dev/null +++ b/solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar @@ -0,0 +1,2 @@ +AnyObjectId[4d9d4e1277822f7a08dd9469ae2ca81d44902552] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java new file mode 100644 index 00000000000..1a79de899f6 --- /dev/null +++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java @@ -0,0 +1,142 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.InputStream; + +import org.apache.commons.io.IOUtils; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.collation.ICUCollationKeyFilter; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; +import org.apache.solr.util.plugin.ResourceLoaderAware; + +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.util.ULocale; + +/** + * Factory for {@link ICUCollationKeyFilter}. + *

+ * <p>
+ * This factory can be created in two ways:
+ * <ul>
+ *   <li>Based upon a system collator associated with a Locale.
+ *   <li>Based upon a tailored ruleset.
+ * </ul>
+ * <p>
+ * Using a system collator:
+ * <ul>
+ *   <li>locale: RFC 3066 locale ID (mandatory)
+ *   <li>strength: 'primary', 'secondary', 'tertiary', 'quaternary', or 'identical' (optional)
+ *   <li>decomposition: 'no' or 'canonical' (optional)
+ * </ul>
+ * <p>
+ * Using a tailored ruleset:
+ * <ul>
+ *   <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
+ *   <li>strength: 'primary', 'secondary', 'tertiary', 'quaternary', or 'identical' (optional)
+ *   <li>decomposition: 'no' or 'canonical' (optional)
+ * </ul>
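+ * <p>
+ * A minimal usage sketch (not part of this patch), mirroring the unit tests later in this
+ * commit; the tokenStream and loader variables are assumed to exist already. Note that
+ * inform(ResourceLoader) must be called before create(), since the collator is built there:
+ * <pre>
+ *   Map args = new HashMap();
+ *   args.put("locale", "tr");
+ *   args.put("strength", "primary");
+ *   ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+ *   factory.init(args);
+ *   factory.inform(loader);            // a ResourceLoader; required even without 'custom'
+ *   TokenStream keys = factory.create(tokenStream);
+ * </pre>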
+ * + * @see Collator + * @see ULocale + * @see RuleBasedCollator + */ +public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { + private Collator collator; + + public void inform(ResourceLoader loader) { + String custom = args.get("custom"); + String localeID = args.get("locale"); + String strength = args.get("strength"); + String decomposition = args.get("decomposition"); + + if (custom == null && localeID == null) + throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required."); + + if (custom != null && localeID != null) + throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. " + + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. " + + "Then save the entire customized ruleset to a file, and use with the custom parameter"); + + if (localeID != null) { + // create from a system collator, based on Locale. + collator = createFromLocale(localeID); + } else { + // create from a custom ruleset + collator = createFromRules(custom, loader); + } + + // set the strength flag, otherwise it will be the default. + if (strength != null) { + if (strength.equalsIgnoreCase("primary")) + collator.setStrength(Collator.PRIMARY); + else if (strength.equalsIgnoreCase("secondary")) + collator.setStrength(Collator.SECONDARY); + else if (strength.equalsIgnoreCase("tertiary")) + collator.setStrength(Collator.TERTIARY); + else if (strength.equalsIgnoreCase("quaternary")) + collator.setStrength(Collator.QUATERNARY); + else if (strength.equalsIgnoreCase("identical")) + collator.setStrength(Collator.IDENTICAL); + else + throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength); + } + + // set the decomposition flag, otherwise it will be the default. + if (decomposition != null) { + if (decomposition.equalsIgnoreCase("no")) + collator.setDecomposition(Collator.NO_DECOMPOSITION); + else if (decomposition.equalsIgnoreCase("canonical")) + collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION); + else + throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition); + } + } + + public TokenStream create(TokenStream input) { + return new ICUCollationKeyFilter(input, collator); + } + + /* + * Create a locale from localeID. + * Then return the appropriate collator for the locale. + */ + private Collator createFromLocale(String localeID) { + return Collator.getInstance(new ULocale(localeID)); + } + + /* + * Read custom rules from a file, and create a RuleBasedCollator + * The file cannot support comments, as # might be in the rules! 
+ */ + private Collator createFromRules(String fileName, ResourceLoader loader) { + InputStream input = null; + try { + input = loader.openResource(fileName); + String rules = IOUtils.toString(input, "UTF-8"); + return new RuleBasedCollator(rules); + } catch (Exception e) { + // io error or invalid rules + throw new RuntimeException(e); + } finally { + IOUtils.closeQuietly(input); + } + } +} diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java new file mode 100644 index 00000000000..c0aa1fbd186 --- /dev/null +++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java @@ -0,0 +1,30 @@ +package org.apache.solr.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.icu.ICUFoldingFilter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Factory for {@link ICUFoldingFilter} */ +public class ICUFoldingFilterFactory extends BaseTokenFilterFactory { + + @Override + public TokenStream create(TokenStream input) { + return new ICUFoldingFilter(input); + } +} diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java new file mode 100644 index 00000000000..860a5c53d9a --- /dev/null +++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java @@ -0,0 +1,81 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.icu.ICUNormalizer2Filter; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; + +import com.ibm.icu.text.FilteredNormalizer2; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.UnicodeSet; + +/** + * Factory for {@link ICUNormalizer2Filter} + *

+ * <p>
+ * Supports the following attributes:
+ * <ul>
+ *   <li>name: A Unicode Normalization Form, one of 'nfc', 'nfkc', or 'nfkc_cf'. Default is nfkc_cf.
+ *   <li>mode: Either 'compose' or 'decompose'. Default is compose. Use "decompose" with nfc or nfkc to get nfd or nfkd, respectively.
+ *   <li>filter: A {@link UnicodeSet} pattern. Codepoints outside the set are always left unchanged. Default is [] (the null set: no filtering).
+ * </ul>
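+ * <p>
+ * A minimal usage sketch (not part of this patch), in the style of the unit tests later in
+ * this commit; the tokenStream variable is assumed to be an existing TokenStream:
+ * <pre>
+ *   Map args = new HashMap();
+ *   args.put("name", "nfkc");
+ *   args.put("mode", "decompose");     // nfkc with decompose yields NFKD
+ *   ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory();
+ *   factory.init(args);                // builds the Normalizer2 instance
+ *   TokenStream normalized = factory.create(tokenStream);
+ * </pre>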
+ * @see ICUNormalizer2Filter + * @see Normalizer2 + * @see FilteredNormalizer2 + */ +public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory { + private Normalizer2 normalizer; + + // TODO: support custom normalization + @Override + public void init(Map args) { + super.init(args); + String name = args.get("name"); + if (name == null) + name = "nfkc_cf"; + String mode = args.get("mode"); + if (mode == null) + mode = "compose"; + + if (mode.equals("compose")) + normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE); + else if (mode.equals("decompose")) + normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.DECOMPOSE); + else + throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid mode: " + mode); + + String filter = args.get("filter"); + if (filter != null) { + UnicodeSet set = new UnicodeSet(filter); + if (!set.isEmpty()) { + set.freeze(); + normalizer = new FilteredNormalizer2(normalizer, set); + } + } + } + + public TokenStream create(TokenStream input) { + return new ICUNormalizer2Filter(input, normalizer); + } +} diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java new file mode 100644 index 00000000000..bbda76fb291 --- /dev/null +++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java @@ -0,0 +1,32 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; + +/** Factory for {@link ICUTokenizer} */ +public class ICUTokenizerFactory extends BaseTokenizerFactory { + // TODO: add support for custom configs + @Override + public Tokenizer create(Reader input) { + return new ICUTokenizer(input); + } +} diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java new file mode 100644 index 00000000000..449bd2055ae --- /dev/null +++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java @@ -0,0 +1,67 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.icu.ICUTransformFilter; +import org.apache.solr.analysis.BaseTokenFilterFactory; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; + +import com.ibm.icu.text.Transliterator; + +/** + * Factory for {@link ICUTransformFilter}. + *

+ * <p>
+ * Supports the following attributes:
+ * <ul>
+ *   <li>id (mandatory): A Transliterator ID, one from {@link Transliterator#getAvailableIDs()}
+ *   <li>direction (optional): Either 'forward' or 'reverse'. Default is forward.
+ * </ul>
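+ * <p>
+ * A minimal usage sketch (not part of this patch), mirroring the unit tests later in this
+ * commit; the tokenStream variable is assumed to be an existing TokenStream:
+ * <pre>
+ *   Map args = new HashMap();
+ *   args.put("id", "Traditional-Simplified");
+ *   ICUTransformFilterFactory factory = new ICUTransformFilterFactory();
+ *   factory.init(args);                // creates the Transliterator; direction defaults to forward
+ *   TokenStream transformed = factory.create(tokenStream);
+ * </pre>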
+ * @see Transliterator + */ +public class ICUTransformFilterFactory extends BaseTokenFilterFactory { + private Transliterator transliterator; + + // TODO: add support for custom rules + @Override + public void init(Map args) { + super.init(args); + String id = args.get("id"); + if (id == null) { + throw new SolrException(ErrorCode.SERVER_ERROR, "id is required."); + } + + int dir; + String direction = args.get("direction"); + if (direction == null || direction.equalsIgnoreCase("forward")) + dir = Transliterator.FORWARD; + else if (direction.equalsIgnoreCase("reverse")) + dir = Transliterator.REVERSE; + else + throw new SolrException(ErrorCode.SERVER_ERROR, "invalid direction: " + direction); + + transliterator = Transliterator.getInstance(id, dir); + } + + public TokenStream create(TokenStream input) { + return new ICUTransformFilter(input, transliterator); + } +} diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java new file mode 100644 index 00000000000..44c42f6f2f6 --- /dev/null +++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java @@ -0,0 +1,170 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.StringReader; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.solr.common.ResourceLoader; + +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.util.ULocale; + +public class TestICUCollationKeyFilterFactory extends BaseTokenTestCase { + + /* + * Turkish has some funny casing. + * This test shows how you can solve this kind of thing easily with collation. + * Instead of using LowerCaseFilter, use a turkish collator with primary strength. + * Then things will sort and match correctly. 
+ */ + public void testBasicUsage() throws IOException { + String turkishUpperCase = "I WİLL USE TURKİSH CASING"; + String turkishLowerCase = "ı will use turkish casıng"; + ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory(); + Map args = new HashMap(); + args.put("locale", "tr"); + args.put("strength", "primary"); + factory.init(args); + factory.inform(new StringMockSolrResourceLoader("")); + TokenStream tsUpper = factory.create( + new KeywordTokenizer(new StringReader(turkishUpperCase))); + TokenStream tsLower = factory.create( + new KeywordTokenizer(new StringReader(turkishLowerCase))); + assertCollatesToSame(tsUpper, tsLower); + } + + /* + * Test usage of the decomposition option for unicode normalization. + */ + public void testNormalization() throws IOException { + String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING"; + String turkishLowerCase = "ı will use turkish casıng"; + ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory(); + Map args = new HashMap(); + args.put("locale", "tr"); + args.put("strength", "primary"); + args.put("decomposition", "canonical"); + factory.init(args); + factory.inform(new StringMockSolrResourceLoader("")); + TokenStream tsUpper = factory.create( + new KeywordTokenizer(new StringReader(turkishUpperCase))); + TokenStream tsLower = factory.create( + new KeywordTokenizer(new StringReader(turkishLowerCase))); + assertCollatesToSame(tsUpper, tsLower); + } + + /* + * Test secondary strength, for english case is not significant. + */ + public void testSecondaryStrength() throws IOException { + String upperCase = "TESTING"; + String lowerCase = "testing"; + ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory(); + Map args = new HashMap(); + args.put("locale", "en"); + args.put("strength", "secondary"); + args.put("decomposition", "no"); + factory.init(args); + factory.inform(new StringMockSolrResourceLoader("")); + TokenStream tsUpper = factory.create( + new KeywordTokenizer(new StringReader(upperCase))); + TokenStream tsLower = factory.create( + new KeywordTokenizer(new StringReader(lowerCase))); + assertCollatesToSame(tsUpper, tsLower); + } + + /* + * For german, you might want oe to sort and match with o umlaut. + * This is not the default, but you can make a customized ruleset to do this. + * + * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior. + * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383 + */ + public void testCustomRules() throws Exception { + RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE")); + + String DIN5007_2_tailorings = + "& ae , a\u0308 & AE , A\u0308"+ + "& oe , o\u0308 & OE , O\u0308"+ + "& ue , u\u0308 & UE , u\u0308"; + + RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings); + String tailoredRules = tailoredCollator.getRules(); + // + // at this point, you would save these tailoredRules to a file, + // and use the custom parameter. 
+ // + String germanUmlaut = "Töne"; + String germanOE = "Toene"; + ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory(); + Map args = new HashMap(); + args.put("custom", "rules.txt"); + args.put("strength", "primary"); + factory.init(args); + factory.inform(new StringMockSolrResourceLoader(tailoredRules)); + TokenStream tsUmlaut = factory.create( + new KeywordTokenizer(new StringReader(germanUmlaut))); + TokenStream tsOE = factory.create( + new KeywordTokenizer(new StringReader(germanOE))); + + assertCollatesToSame(tsUmlaut, tsOE); + } + + private class StringMockSolrResourceLoader implements ResourceLoader { + String text; + + StringMockSolrResourceLoader(String text) { + this.text = text; + } + + public List getLines(String resource) throws IOException { + return null; + } + + public Object newInstance(String cname, String... subpackages) { + return null; + } + + public InputStream openResource(String resource) throws IOException { + return new ByteArrayInputStream(text.getBytes("UTF-8")); + } + } + + private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) + throws IOException { + CharTermAttribute term1 = stream1 + .addAttribute(CharTermAttribute.class); + CharTermAttribute term2 = stream2 + .addAttribute(CharTermAttribute.class); + assertTrue(stream1.incrementToken()); + assertTrue(stream2.incrementToken()); + assertEquals(term1.toString(), term2.toString()); + assertFalse(stream1.incrementToken()); + assertFalse(stream2.incrementToken()); + } +} diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java new file mode 100644 index 00000000000..5fc3d653f8a --- /dev/null +++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java @@ -0,0 +1,39 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** basic tests for {@link ICUFoldingFilterFactory} */ +public class TestICUFoldingFilterFactory extends BaseTokenTestCase { + + /** basic tests to ensure the folding is working */ + public void test() throws Exception { + Reader reader = new StringReader("Résumé"); + ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory(); + factory.init(DEFAULT_VERSION_PARAM); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "resume" }); + } +} diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java new file mode 100644 index 00000000000..200890d3298 --- /dev/null +++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java @@ -0,0 +1,41 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** basic tests for {@link ICUNormalizer2FilterFactory} */ +public class TestICUNormalizer2FilterFactory extends BaseTokenTestCase { + + /** Test nfkc_cf defaults */ + public void testDefaults() throws Exception { + Reader reader = new StringReader("This is a Test"); + ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory(); + factory.init(DEFAULT_VERSION_PARAM); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" }); + } + + // TODO: add tests for different forms +} diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java new file mode 100644 index 00000000000..8b6992ec0b6 --- /dev/null +++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java @@ -0,0 +1,35 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; + +/** basic tests for {@link ICUTokenizerFactory} **/ +public class TestICUTokenizerFactory extends BaseTokenTestCase { + public void testMixedText() throws Exception { + Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ"); + ICUTokenizerFactory factory = new ICUTokenizerFactory(); + TokenStream stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", + "This", "is", "a", "test", "ກວ່າ", "ດອກ"}); + } +} diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java new file mode 100644 index 00000000000..9df2c998570 --- /dev/null +++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java @@ -0,0 +1,64 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** basic tests for {@link ICUTransformFilterFactory} */ +public class TestICUTransformFilterFactory extends BaseTokenTestCase { + + /** ensure the transform is working */ + public void test() throws Exception { + Reader reader = new StringReader("簡化字"); + ICUTransformFilterFactory factory = new ICUTransformFilterFactory(); + Map args = new HashMap(); + args.put("id", "Traditional-Simplified"); + factory.init(args); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "简化字" }); + } + + /** test forward and reverse direction */ + public void testDirection() throws Exception { + // forward + Reader reader = new StringReader("Российская Федерация"); + ICUTransformFilterFactory factory = new ICUTransformFilterFactory(); + Map args = new HashMap(); + args.put("id", "Cyrillic-Latin"); + factory.init(args); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "Rossijskaâ", "Federaciâ" }); + + // backward (invokes Latin-Cyrillic) + reader = new StringReader("Rossijskaâ Federaciâ"); + args.put("direction", "reverse"); + factory.init(args); + tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "Российская", "Федерация" }); + } +}