diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 4c69968943b..d5b6701de7d 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -297,6 +297,8 @@ New Features
built-in load balancing, and infrastructure for future SolrCloud work.
(yonik, Mark Miller)
+* SOLR-2210: Add ICU-based tokenizer and filters to contrib/analysis-extras (rmuir)
+
Optimizations
----------------------
diff --git a/solr/build.xml b/solr/build.xml
index b1798cb80f7..81e66356c10 100644
--- a/solr/build.xml
+++ b/solr/build.xml
@@ -34,9 +34,6 @@
-
-
-
+
@@ -509,6 +507,7 @@
+
@@ -609,6 +608,8 @@
basedir="contrib/extraction/src" />
+
-->
+
@@ -721,7 +724,7 @@
+ excludes="lib/README.committers.txt **/data/ **/logs/* **/classes/ **/*.sh **/bin/ src/scripts/ src/site/build/ **/target/ client/ruby/flare/ client/python contrib/**/build/ **/*.iml **/*.ipr **/*.iws contrib/clustering/example/lib/** contrib/clustering/lib/downloads/** contrib/analysis-extras/lib/**" />
@@ -952,6 +955,8 @@
+
+
diff --git a/solr/common-build.xml b/solr/common-build.xml
index de0aacae36f..d95925c2d26 100644
--- a/solr/common-build.xml
+++ b/solr/common-build.xml
@@ -23,6 +23,9 @@
+
+
+
diff --git a/solr/contrib/analysis-extras/README.txt b/solr/contrib/analysis-extras/README.txt
new file mode 100644
index 00000000000..2c60e0e7917
--- /dev/null
+++ b/solr/contrib/analysis-extras/README.txt
@@ -0,0 +1,16 @@
+The analysis-extras plugin provides additional analyzers that rely
+upon large dependencies/dictionaries.
+
+It includes integration with ICU for multilingual support, and
+analyzers for Chinese and Polish.
+
+Relies upon the following Lucene components (in lucene-libs/):
+
+ * lucene-analyzers-icu-X.Y.jar
+ * lucene-analyzers-smartcn-X.Y.jar
+ * lucene-analyzers-stempel-X.Y.jar
+
+And the ICU library (in lib/):
+
+ * icu4j-X.Y.jar
+
\ No newline at end of file
diff --git a/solr/contrib/analysis-extras/build.xml b/solr/contrib/analysis-extras/build.xml
new file mode 100644
index 00000000000..af4a13da4b2
--- /dev/null
+++ b/solr/contrib/analysis-extras/build.xml
@@ -0,0 +1,203 @@
+
+
+
+
+
+
+
+
+
+
+
+ Additional analysis components
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tests failed!
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar b/solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar
new file mode 100644
index 00000000000..3120f680cb3
--- /dev/null
+++ b/solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar
@@ -0,0 +1,2 @@
+AnyObjectId[4d9d4e1277822f7a08dd9469ae2ca81d44902552] was removed in git history.
+Apache SVN contains full history.
\ No newline at end of file
diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java
new file mode 100644
index 00000000000..1a79de899f6
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java
@@ -0,0 +1,142 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.InputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.collation.ICUCollationKeyFilter;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Factory for {@link ICUCollationKeyFilter}.
+ * <p>
+ * This factory can be created in two ways:
+ * <ul>
+ *  <li>Based upon a system collator associated with a Locale.</li>
+ *  <li>Based upon a tailored ruleset.</li>
+ * </ul>
+ * <p>
+ * Using a System collator:
+ * <ul>
+ *  <li>locale: RFC 3066 locale ID (mandatory)</li>
+ *  <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)</li>
+ *  <li>decomposition: 'no', or 'canonical' (optional)</li>
+ * </ul>
+ * <p>
+ * Using a Tailored ruleset:
+ * <ul>
+ *  <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)</li>
+ *  <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)</li>
+ *  <li>decomposition: 'no' or 'canonical' (optional)</li>
+ * </ul>
+ *
+ * @see Collator
+ * @see ULocale
+ * @see RuleBasedCollator
+ */
+public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+  private Collator collator;
+
+  /**
+   * Builds the collator from the factory arguments.
+   * <p>
+   * Exactly one of <code>locale</code> or <code>custom</code> must be given; the
+   * optional <code>strength</code> and <code>decomposition</code> arguments are then
+   * applied to the resulting collator.
+   *
+   * @param loader used to open the rules file named by the <code>custom</code> argument
+   * @throws SolrException if the arguments are missing, conflicting or invalid, or if
+   *         the custom rules cannot be loaded
+   */
+  public void inform(ResourceLoader loader) {
+    String custom = args.get("custom");
+    String localeID = args.get("locale");
+    String strength = args.get("strength");
+    String decomposition = args.get("decomposition");
+
+    if (custom == null && localeID == null)
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
+
+    if (custom != null && localeID != null)
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. "
+          + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
+          + "Then save the entire customized ruleset to a file, and use with the custom parameter");
+
+    if (localeID != null) {
+      // create from a system collator, based on Locale.
+      collator = createFromLocale(localeID);
+    } else {
+      // create from a custom ruleset
+      collator = createFromRules(custom, loader);
+    }
+
+    // set the strength flag, otherwise it will be the default.
+    if (strength != null) {
+      if (strength.equalsIgnoreCase("primary"))
+        collator.setStrength(Collator.PRIMARY);
+      else if (strength.equalsIgnoreCase("secondary"))
+        collator.setStrength(Collator.SECONDARY);
+      else if (strength.equalsIgnoreCase("tertiary"))
+        collator.setStrength(Collator.TERTIARY);
+      else if (strength.equalsIgnoreCase("quaternary"))
+        collator.setStrength(Collator.QUATERNARY);
+      else if (strength.equalsIgnoreCase("identical"))
+        collator.setStrength(Collator.IDENTICAL);
+      else
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength);
+    }
+
+    // set the decomposition flag, otherwise it will be the default.
+    if (decomposition != null) {
+      if (decomposition.equalsIgnoreCase("no"))
+        collator.setDecomposition(Collator.NO_DECOMPOSITION);
+      else if (decomposition.equalsIgnoreCase("canonical"))
+        collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
+      else
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
+    }
+  }
+
+  /**
+   * Wraps <code>input</code> in an {@link ICUCollationKeyFilter} that uses the
+   * collator built by {@link #inform(ResourceLoader)}.
+   */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new ICUCollationKeyFilter(input, collator);
+  }
+
+  /*
+   * Create a locale from localeID.
+   * Then return the appropriate collator for the locale.
+   */
+  private Collator createFromLocale(String localeID) {
+    return Collator.getInstance(new ULocale(localeID));
+  }
+
+  /*
+   * Read custom rules from a file, and create a RuleBasedCollator
+   * The file cannot support comments, as # might be in the rules!
+   *
+   * Failures are reported as SolrException (a RuntimeException subclass), like the
+   * other configuration errors of this factory, and name the offending resource.
+   */
+  private Collator createFromRules(String fileName, ResourceLoader loader) {
+    InputStream input = null;
+    try {
+      input = loader.openResource(fileName);
+      String rules = IOUtils.toString(input, "UTF-8");
+      return new RuleBasedCollator(rules);
+    } catch (Exception e) {
+      // io error or invalid rules
+      throw new SolrException(ErrorCode.SERVER_ERROR,
+          "Unable to create collator from custom rules file: " + fileName, e);
+    } finally {
+      IOUtils.closeQuietly(input);
+    }
+  }
+}
diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java
new file mode 100644
index 00000000000..c0aa1fbd186
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java
@@ -0,0 +1,30 @@
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.icu.ICUFoldingFilter;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Factory for {@link ICUFoldingFilter}.
+ * <p>
+ * Takes no configuration arguments of its own.
+ */
+public class ICUFoldingFilterFactory extends BaseTokenFilterFactory {
+
+  /** Wraps the given stream in a new {@link ICUFoldingFilter}. */
+  @Override
+  public TokenStream create(TokenStream input) {
+    final TokenStream folded = new ICUFoldingFilter(input);
+    return folded;
+  }
+}
diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java
new file mode 100644
index 00000000000..860a5c53d9a
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java
@@ -0,0 +1,81 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+
+import com.ibm.icu.text.FilteredNormalizer2;
+import com.ibm.icu.text.Normalizer2;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Factory for {@link ICUNormalizer2Filter}
+ * <p>
+ * Supports the following attributes:
+ * <ul>
+ *  <li>name: A Unicode Normalization Form,
+ *      one of 'nfc','nfkc', 'nfkc_cf'. Default is nfkc_cf.</li>
+ *  <li>mode: Either 'compose' or 'decompose'. Default is compose. Use "decompose" with nfc
+ *      or nfkc, to get nfd or nfkd, respectively.</li>
+ *  <li>filter: A {@link UnicodeSet} pattern. Codepoints outside the set are
+ *      always left unchanged. Default is [] (the null set, no filtering).</li>
+ * </ul>
+ * @see ICUNormalizer2Filter
+ * @see Normalizer2
+ * @see FilteredNormalizer2
+ */
+public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory {
+  private Normalizer2 normalizer;
+
+  /**
+   * Reads the <code>name</code>, <code>mode</code> and <code>filter</code> arguments
+   * and builds the normalizer used by {@link #create(TokenStream)}.
+   *
+   * @param args factory arguments; see the class documentation for the supported keys
+   * @throws SolrException if <code>mode</code> is neither 'compose' nor 'decompose'
+   */
+  // TODO: support custom normalization
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    String name = args.get("name");
+    if (name == null)
+      name = "nfkc_cf";
+    String mode = args.get("mode");
+    if (mode == null)
+      mode = "compose";
+
+    // mode is matched case-insensitively, like the option values of the other ICU factories
+    if (mode.equalsIgnoreCase("compose"))
+      normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE);
+    else if (mode.equalsIgnoreCase("decompose"))
+      normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.DECOMPOSE);
+    else
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid mode: " + mode);
+
+    String filter = args.get("filter");
+    if (filter != null) {
+      UnicodeSet set = new UnicodeSet(filter);
+      // an empty set means "no filtering", so the plain normalizer is kept
+      if (!set.isEmpty()) {
+        set.freeze();
+        normalizer = new FilteredNormalizer2(normalizer, set);
+      }
+    }
+  }
+
+  /** Wraps <code>input</code> in an {@link ICUNormalizer2Filter} using the configured normalizer. */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new ICUNormalizer2Filter(input, normalizer);
+  }
+}
diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java
new file mode 100644
index 00000000000..bbda76fb291
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java
@@ -0,0 +1,32 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
+
+/**
+ * Factory for {@link ICUTokenizer}.
+ * <p>
+ * The tokenizer is always built with its single-argument constructor; no
+ * configuration arguments are read.
+ */
+public class ICUTokenizerFactory extends BaseTokenizerFactory {
+  // TODO: add support for custom configs
+
+  /** Creates a new {@link ICUTokenizer} reading from <code>input</code>. */
+  @Override
+  public Tokenizer create(Reader input) {
+    final Tokenizer tokenizer = new ICUTokenizer(input);
+    return tokenizer;
+  }
+}
diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java
new file mode 100644
index 00000000000..449bd2055ae
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java
@@ -0,0 +1,67 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.icu.ICUTransformFilter;
+import org.apache.solr.analysis.BaseTokenFilterFactory;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+
+import com.ibm.icu.text.Transliterator;
+
+/**
+ * Factory for {@link ICUTransformFilter}.
+ * <p>
+ * Supports the following attributes:
+ * <ul>
+ *  <li>id (mandatory): A Transliterator ID, one from {@link Transliterator#getAvailableIDs()}</li>
+ *  <li>direction (optional): Either 'forward' or 'reverse'. Default is forward.</li>
+ * </ul>
+ * @see Transliterator
+ */
+public class ICUTransformFilterFactory extends BaseTokenFilterFactory {
+  private Transliterator transliterator;
+
+  /**
+   * Reads the <code>id</code> and <code>direction</code> arguments and obtains the
+   * matching {@link Transliterator}.
+   *
+   * @param args factory arguments; see the class documentation for the supported keys
+   * @throws SolrException if <code>id</code> is missing or <code>direction</code> is
+   *         neither 'forward' nor 'reverse'
+   */
+  // TODO: add support for custom rules
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    String id = args.get("id");
+    if (id == null) {
+      throw new SolrException(ErrorCode.SERVER_ERROR, "id is required.");
+    }
+
+    int dir;
+    String direction = args.get("direction");
+    // a missing direction means forward
+    if (direction == null || direction.equalsIgnoreCase("forward"))
+      dir = Transliterator.FORWARD;
+    else if (direction.equalsIgnoreCase("reverse"))
+      dir = Transliterator.REVERSE;
+    else
+      throw new SolrException(ErrorCode.SERVER_ERROR, "invalid direction: " + direction);
+
+    transliterator = Transliterator.getInstance(id, dir);
+  }
+
+  /** Wraps <code>input</code> in an {@link ICUTransformFilter} using the configured transliterator. */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new ICUTransformFilter(input, transliterator);
+  }
+}
diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java
new file mode 100644
index 00000000000..44c42f6f2f6
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java
@@ -0,0 +1,170 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.solr.common.ResourceLoader;
+
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.util.ULocale;
+
+/** Basic tests for {@link ICUCollationKeyFilterFactory}. */
+public class TestICUCollationKeyFilterFactory extends BaseTokenTestCase {
+
+  /*
+   * Turkish has some funny casing.
+   * This test shows how you can solve this kind of thing easily with collation.
+   * Instead of using LowerCaseFilter, use a turkish collator with primary strength.
+   * Then things will sort and match correctly.
+   */
+  public void testBasicUsage() throws IOException {
+    String turkishUpperCase = "I WİLL USE TURKİSH CASING";
+    String turkishLowerCase = "ı will use turkish casıng";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "tr");
+    args.put("strength", "primary");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsUpper = factory.create(
+        new KeywordTokenizer(new StringReader(turkishUpperCase)));
+    TokenStream tsLower = factory.create(
+        new KeywordTokenizer(new StringReader(turkishLowerCase)));
+    assertCollatesToSame(tsUpper, tsLower);
+  }
+
+  /*
+   * Test usage of the decomposition option for unicode normalization.
+   */
+  public void testNormalization() throws IOException {
+    String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
+    String turkishLowerCase = "ı will use turkish casıng";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "tr");
+    args.put("strength", "primary");
+    args.put("decomposition", "canonical");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsUpper = factory.create(
+        new KeywordTokenizer(new StringReader(turkishUpperCase)));
+    TokenStream tsLower = factory.create(
+        new KeywordTokenizer(new StringReader(turkishLowerCase)));
+    assertCollatesToSame(tsUpper, tsLower);
+  }
+
+  /*
+   * Test secondary strength, for english case is not significant.
+   */
+  public void testSecondaryStrength() throws IOException {
+    String upperCase = "TESTING";
+    String lowerCase = "testing";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "en");
+    args.put("strength", "secondary");
+    args.put("decomposition", "no");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsUpper = factory.create(
+        new KeywordTokenizer(new StringReader(upperCase)));
+    TokenStream tsLower = factory.create(
+        new KeywordTokenizer(new StringReader(lowerCase)));
+    assertCollatesToSame(tsUpper, tsLower);
+  }
+
+  /*
+   * For german, you might want oe to sort and match with o umlaut.
+   * This is not the default, but you can make a customized ruleset to do this.
+   *
+   * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
+   *  http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
+   */
+  public void testCustomRules() throws Exception {
+    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
+
+    // each upper-case digraph is tailored to its upper-case umlaut
+    String DIN5007_2_tailorings =
+      "& ae , a\u0308 & AE , A\u0308"+
+      "& oe , o\u0308 & OE , O\u0308"+
+      "& ue , u\u0308 & UE , U\u0308";
+
+    RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
+    String tailoredRules = tailoredCollator.getRules();
+    //
+    // at this point, you would save these tailoredRules to a file,
+    // and use the custom parameter.
+    //
+    String germanUmlaut = "Töne";
+    String germanOE = "Toene";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("custom", "rules.txt");
+    args.put("strength", "primary");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(tailoredRules));
+    TokenStream tsUmlaut = factory.create(
+        new KeywordTokenizer(new StringReader(germanUmlaut)));
+    TokenStream tsOE = factory.create(
+        new KeywordTokenizer(new StringReader(germanOE)));
+
+    assertCollatesToSame(tsUmlaut, tsOE);
+  }
+
+  /*
+   * ResourceLoader stub: openResource returns the configured text as UTF-8 bytes
+   * for any resource name; the other methods return null.
+   * Static, since it needs nothing from the enclosing test instance.
+   */
+  private static class StringMockSolrResourceLoader implements ResourceLoader {
+    private final String text;
+
+    StringMockSolrResourceLoader(String text) {
+      this.text = text;
+    }
+
+    public List<String> getLines(String resource) throws IOException {
+      return null;
+    }
+
+    public Object newInstance(String cname, String... subpackages) {
+      return null;
+    }
+
+    public InputStream openResource(String resource) throws IOException {
+      return new ByteArrayInputStream(text.getBytes("UTF-8"));
+    }
+  }
+
+  /*
+   * Asserts that both streams produce exactly one token each, and that the
+   * two tokens have the same term text.
+   */
+  private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
+      throws IOException {
+    CharTermAttribute term1 = stream1
+        .addAttribute(CharTermAttribute.class);
+    CharTermAttribute term2 = stream2
+        .addAttribute(CharTermAttribute.class);
+    assertTrue(stream1.incrementToken());
+    assertTrue(stream2.incrementToken());
+    assertEquals(term1.toString(), term2.toString());
+    assertFalse(stream1.incrementToken());
+    assertFalse(stream2.incrementToken());
+  }
+}
diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java
new file mode 100644
index 00000000000..5fc3d653f8a
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java
@@ -0,0 +1,39 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/** basic tests for {@link ICUFoldingFilterFactory} */
+public class TestICUFoldingFilterFactory extends BaseTokenTestCase {
+
+ /** basic tests to ensure the folding is working */
+ public void test() throws Exception {
+ Reader reader = new StringReader("Résumé");
+ ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory();
+ factory.init(DEFAULT_VERSION_PARAM);
+ Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+ TokenStream stream = factory.create(tokenizer);
+ assertTokenStreamContents(stream, new String[] { "resume" });
+ }
+}
diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java
new file mode 100644
index 00000000000..200890d3298
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java
@@ -0,0 +1,41 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/** basic tests for {@link ICUNormalizer2FilterFactory} */
+public class TestICUNormalizer2FilterFactory extends BaseTokenTestCase {
+
+ /** Test nfkc_cf defaults */
+ public void testDefaults() throws Exception {
+ Reader reader = new StringReader("This is a Test");
+ ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory();
+ factory.init(DEFAULT_VERSION_PARAM);
+ Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+ TokenStream stream = factory.create(tokenizer);
+ assertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" });
+ }
+
+ // TODO: add tests for different forms
+}
diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java
new file mode 100644
index 00000000000..8b6992ec0b6
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java
@@ -0,0 +1,35 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+
+/** basic tests for {@link ICUTokenizerFactory} **/
+public class TestICUTokenizerFactory extends BaseTokenTestCase {
+ public void testMixedText() throws Exception {
+ Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ");
+ ICUTokenizerFactory factory = new ICUTokenizerFactory();
+ TokenStream stream = factory.create(reader);
+ assertTokenStreamContents(stream,
+ new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี",
+ "This", "is", "a", "test", "ກວ່າ", "ດອກ"});
+ }
+}
diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java
new file mode 100644
index 00000000000..9df2c998570
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java
@@ -0,0 +1,64 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/** Basic tests for {@link ICUTransformFilterFactory}. */
+public class TestICUTransformFilterFactory extends BaseTokenTestCase {
+
+  /** ensure the transform is working */
+  public void test() throws Exception {
+    Reader reader = new StringReader("簡化字");
+    ICUTransformFilterFactory factory = new ICUTransformFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("id", "Traditional-Simplified");
+    factory.init(args);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "简化字" });
+  }
+
+  /** test forward and reverse direction */
+  public void testDirection() throws Exception {
+    // forward
+    Reader reader = new StringReader("Российская Федерация");
+    ICUTransformFilterFactory factory = new ICUTransformFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("id", "Cyrillic-Latin");
+    factory.init(args);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "Rossijskaâ", "Federaciâ" });
+
+    // backward (invokes Latin-Cyrillic)
+    // the same factory is re-initialized with the direction argument added
+    reader = new StringReader("Rossijskaâ Federaciâ");
+    args.put("direction", "reverse");
+    factory.init(args);
+    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "Российская", "Федерация" });
+  }
+}