From 5c6b4f4f65026ecf8dbe3a5390966262178d6b18 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 2 Nov 2010 12:03:18 +0000 Subject: [PATCH] SOLR-2210: add factories for icu analyzers git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1030012 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 2 + solr/build.xml | 13 +- solr/common-build.xml | 3 + solr/contrib/analysis-extras/README.txt | 16 ++ solr/contrib/analysis-extras/build.xml | 203 ++++++++++++++++++ .../analysis-extras/lib/icu4j-4_4_2.jar | 2 + .../ICUCollationKeyFilterFactory.java | 142 ++++++++++++ .../analysis/ICUFoldingFilterFactory.java | 30 +++ .../analysis/ICUNormalizer2FilterFactory.java | 81 +++++++ .../solr/analysis/ICUTokenizerFactory.java | 32 +++ .../analysis/ICUTransformFilterFactory.java | 67 ++++++ .../TestICUCollationKeyFilterFactory.java | 170 +++++++++++++++ .../analysis/TestICUFoldingFilterFactory.java | 39 ++++ .../TestICUNormalizer2FilterFactory.java | 41 ++++ .../analysis/TestICUTokenizerFactory.java | 35 +++ .../TestICUTransformFilterFactory.java | 64 ++++++ 16 files changed, 936 insertions(+), 4 deletions(-) create mode 100644 solr/contrib/analysis-extras/README.txt create mode 100644 solr/contrib/analysis-extras/build.xml create mode 100644 solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar create mode 100644 solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java create mode 100644 solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java create mode 100644 solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 4c69968943b..d5b6701de7d 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -297,6 +297,8 @@ New Features built-in load balancing, and infrastructure for future SolrCloud work. 
(yonik, Mark Miller) +* SOLR-2210: Add icu-based tokenizer and filters to contrib/analysis-extras (rmuir) + Optimizations ---------------------- diff --git a/solr/build.xml b/solr/build.xml index b1798cb80f7..81e66356c10 100644 --- a/solr/build.xml +++ b/solr/build.xml @@ -34,9 +34,6 @@ - - - + @@ -509,6 +507,7 @@ + @@ -609,6 +608,8 @@ basedir="contrib/extraction/src" /> + --> + @@ -721,7 +724,7 @@ + excludes="lib/README.committers.txt **/data/ **/logs/* **/classes/ **/*.sh **/bin/ src/scripts/ src/site/build/ **/target/ client/ruby/flare/ client/python contrib/**/build/ **/*.iml **/*.ipr **/*.iws contrib/clustering/example/lib/** contrib/clustering/lib/downloads/** contrib/analysis-extras/lib/**" /> @@ -952,6 +955,8 @@ + + diff --git a/solr/common-build.xml b/solr/common-build.xml index de0aacae36f..d95925c2d26 100644 --- a/solr/common-build.xml +++ b/solr/common-build.xml @@ -23,6 +23,9 @@ + + + diff --git a/solr/contrib/analysis-extras/README.txt b/solr/contrib/analysis-extras/README.txt new file mode 100644 index 00000000000..2c60e0e7917 --- /dev/null +++ b/solr/contrib/analysis-extras/README.txt @@ -0,0 +1,16 @@ +The analysis-extras plugin provides additional analyzers that rely +upon large dependencies/dictionaries. + +It includes integration with ICU for multilingual support, and +analyzers for Chinese and Polish. + +Relies upon the following lucene components (in lucene-libs/): + + * lucene-analyzers-icu-X.Y.jar + * lucene-analyzers-smartcn-X.Y.jar + * lucene-analyzers-stempel-X.Y.jar + +And the ICU library (in lib/): + + * icu4j-X.Y.jar + \ No newline at end of file diff --git a/solr/contrib/analysis-extras/build.xml b/solr/contrib/analysis-extras/build.xml new file mode 100644 index 00000000000..af4a13da4b2 --- /dev/null +++ b/solr/contrib/analysis-extras/build.xml @@ -0,0 +1,203 @@ + + + + + + + + + + + + Additional analysis components + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tests failed! + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar b/solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar new file mode 100644 index 00000000000..3120f680cb3 --- /dev/null +++ b/solr/contrib/analysis-extras/lib/icu4j-4_4_2.jar @@ -0,0 +1,2 @@ +AnyObjectId[4d9d4e1277822f7a08dd9469ae2ca81d44902552] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java new file mode 100644 index 00000000000..1a79de899f6 --- /dev/null +++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java @@ -0,0 +1,142 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.InputStream; + +import org.apache.commons.io.IOUtils; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.collation.ICUCollationKeyFilter; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; +import org.apache.solr.util.plugin.ResourceLoaderAware; + +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.util.ULocale; + +/** + * Factory for {@link ICUCollationKeyFilter}. + *

+ * <p>
+ * This factory can be created in two ways:
+ * <ul>
+ *   <li>Based upon a system collator associated with a Locale.
+ *   <li>Based upon a tailored ruleset.
+ * </ul>
+ * <p>
+ * Using a system collator:
+ * <ul>
+ *   <li>locale: RFC 3066 locale ID (mandatory)
+ *   <li>strength: 'primary', 'secondary', 'tertiary', 'quaternary', or 'identical' (optional)
+ *   <li>decomposition: 'no' or 'canonical' (optional)
+ * </ul>
+ * <p>
+ * Using a tailored ruleset:
+ * <ul>
+ *   <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
+ *   <li>strength: 'primary', 'secondary', 'tertiary', 'quaternary', or 'identical' (optional)
+ *   <li>decomposition: 'no' or 'canonical' (optional)
+ * </ul>
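+ * <p>
+ * A minimal usage sketch (not part of this patch), mirroring the unit tests later in this
+ * commit; the tokenStream and loader variables are assumed to exist already. Note that
+ * inform(ResourceLoader) must be called before create(), since the collator is built there:
+ * <pre>
+ *   Map args = new HashMap();
+ *   args.put("locale", "tr");
+ *   args.put("strength", "primary");
+ *   ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+ *   factory.init(args);
+ *   factory.inform(loader);            // a ResourceLoader; required even without 'custom'
+ *   TokenStream keys = factory.create(tokenStream);
+ * </pre>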
+ * + * @see Collator + * @see ULocale + * @see RuleBasedCollator + */ +public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { + private Collator collator; + + public void inform(ResourceLoader loader) { + String custom = args.get("custom"); + String localeID = args.get("locale"); + String strength = args.get("strength"); + String decomposition = args.get("decomposition"); + + if (custom == null && localeID == null) + throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required."); + + if (custom != null && localeID != null) + throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. " + + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. " + + "Then save the entire customized ruleset to a file, and use with the custom parameter"); + + if (localeID != null) { + // create from a system collator, based on Locale. + collator = createFromLocale(localeID); + } else { + // create from a custom ruleset + collator = createFromRules(custom, loader); + } + + // set the strength flag, otherwise it will be the default. + if (strength != null) { + if (strength.equalsIgnoreCase("primary")) + collator.setStrength(Collator.PRIMARY); + else if (strength.equalsIgnoreCase("secondary")) + collator.setStrength(Collator.SECONDARY); + else if (strength.equalsIgnoreCase("tertiary")) + collator.setStrength(Collator.TERTIARY); + else if (strength.equalsIgnoreCase("quaternary")) + collator.setStrength(Collator.QUATERNARY); + else if (strength.equalsIgnoreCase("identical")) + collator.setStrength(Collator.IDENTICAL); + else + throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength); + } + + // set the decomposition flag, otherwise it will be the default. + if (decomposition != null) { + if (decomposition.equalsIgnoreCase("no")) + collator.setDecomposition(Collator.NO_DECOMPOSITION); + else if (decomposition.equalsIgnoreCase("canonical")) + collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION); + else + throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition); + } + } + + public TokenStream create(TokenStream input) { + return new ICUCollationKeyFilter(input, collator); + } + + /* + * Create a locale from localeID. + * Then return the appropriate collator for the locale. + */ + private Collator createFromLocale(String localeID) { + return Collator.getInstance(new ULocale(localeID)); + } + + /* + * Read custom rules from a file, and create a RuleBasedCollator + * The file cannot support comments, as # might be in the rules! 
+ */ + private Collator createFromRules(String fileName, ResourceLoader loader) { + InputStream input = null; + try { + input = loader.openResource(fileName); + String rules = IOUtils.toString(input, "UTF-8"); + return new RuleBasedCollator(rules); + } catch (Exception e) { + // io error or invalid rules + throw new RuntimeException(e); + } finally { + IOUtils.closeQuietly(input); + } + } +} diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java new file mode 100644 index 00000000000..c0aa1fbd186 --- /dev/null +++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java @@ -0,0 +1,30 @@ +package org.apache.solr.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.icu.ICUFoldingFilter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Factory for {@link ICUFoldingFilter} */ +public class ICUFoldingFilterFactory extends BaseTokenFilterFactory { + + @Override + public TokenStream create(TokenStream input) { + return new ICUFoldingFilter(input); + } +} diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java new file mode 100644 index 00000000000..860a5c53d9a --- /dev/null +++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java @@ -0,0 +1,81 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.icu.ICUNormalizer2Filter; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; + +import com.ibm.icu.text.FilteredNormalizer2; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.UnicodeSet; + +/** + * Factory for {@link ICUNormalizer2Filter} + *

+ * <p>
+ * Supports the following attributes:
+ * <ul>
+ *   <li>name: A Unicode Normalization Form, one of 'nfc', 'nfkc', or 'nfkc_cf'. Default is nfkc_cf.
+ *   <li>mode: Either 'compose' or 'decompose'. Default is compose. Use "decompose" with nfc or nfkc to get nfd or nfkd, respectively.
+ *   <li>filter: A {@link UnicodeSet} pattern. Codepoints outside the set are always left unchanged. Default is [] (the null set: no filtering).
+ * </ul>
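+ * <p>
+ * A minimal usage sketch (not part of this patch), in the style of the unit tests later in
+ * this commit; the tokenStream variable is assumed to be an existing TokenStream:
+ * <pre>
+ *   Map args = new HashMap();
+ *   args.put("name", "nfkc");
+ *   args.put("mode", "decompose");     // nfkc with decompose yields NFKD
+ *   ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory();
+ *   factory.init(args);                // builds the Normalizer2 instance
+ *   TokenStream normalized = factory.create(tokenStream);
+ * </pre>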
+ * @see ICUNormalizer2Filter + * @see Normalizer2 + * @see FilteredNormalizer2 + */ +public class ICUNormalizer2FilterFactory extends BaseTokenFilterFactory { + private Normalizer2 normalizer; + + // TODO: support custom normalization + @Override + public void init(Map args) { + super.init(args); + String name = args.get("name"); + if (name == null) + name = "nfkc_cf"; + String mode = args.get("mode"); + if (mode == null) + mode = "compose"; + + if (mode.equals("compose")) + normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE); + else if (mode.equals("decompose")) + normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.DECOMPOSE); + else + throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid mode: " + mode); + + String filter = args.get("filter"); + if (filter != null) { + UnicodeSet set = new UnicodeSet(filter); + if (!set.isEmpty()) { + set.freeze(); + normalizer = new FilteredNormalizer2(normalizer, set); + } + } + } + + public TokenStream create(TokenStream input) { + return new ICUNormalizer2Filter(input, normalizer); + } +} diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java new file mode 100644 index 00000000000..bbda76fb291 --- /dev/null +++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java @@ -0,0 +1,32 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; + +/** Factory for {@link ICUTokenizer} */ +public class ICUTokenizerFactory extends BaseTokenizerFactory { + // TODO: add support for custom configs + @Override + public Tokenizer create(Reader input) { + return new ICUTokenizer(input); + } +} diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java new file mode 100644 index 00000000000..449bd2055ae --- /dev/null +++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java @@ -0,0 +1,67 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.icu.ICUTransformFilter; +import org.apache.solr.analysis.BaseTokenFilterFactory; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; + +import com.ibm.icu.text.Transliterator; + +/** + * Factory for {@link ICUTransformFilter}. + *

+ * <p>
+ * Supports the following attributes:
+ * <ul>
+ *   <li>id (mandatory): A Transliterator ID, one from {@link Transliterator#getAvailableIDs()}
+ *   <li>direction (optional): Either 'forward' or 'reverse'. Default is forward.
+ * </ul>
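+ * <p>
+ * A minimal usage sketch (not part of this patch), mirroring the unit tests later in this
+ * commit; the tokenStream variable is assumed to be an existing TokenStream:
+ * <pre>
+ *   Map args = new HashMap();
+ *   args.put("id", "Traditional-Simplified");
+ *   ICUTransformFilterFactory factory = new ICUTransformFilterFactory();
+ *   factory.init(args);                // creates the Transliterator; direction defaults to forward
+ *   TokenStream transformed = factory.create(tokenStream);
+ * </pre>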
+ * @see Transliterator + */ +public class ICUTransformFilterFactory extends BaseTokenFilterFactory { + private Transliterator transliterator; + + // TODO: add support for custom rules + @Override + public void init(Map args) { + super.init(args); + String id = args.get("id"); + if (id == null) { + throw new SolrException(ErrorCode.SERVER_ERROR, "id is required."); + } + + int dir; + String direction = args.get("direction"); + if (direction == null || direction.equalsIgnoreCase("forward")) + dir = Transliterator.FORWARD; + else if (direction.equalsIgnoreCase("reverse")) + dir = Transliterator.REVERSE; + else + throw new SolrException(ErrorCode.SERVER_ERROR, "invalid direction: " + direction); + + transliterator = Transliterator.getInstance(id, dir); + } + + public TokenStream create(TokenStream input) { + return new ICUTransformFilter(input, transliterator); + } +} diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java new file mode 100644 index 00000000000..44c42f6f2f6 --- /dev/null +++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java @@ -0,0 +1,170 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.StringReader; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.solr.common.ResourceLoader; + +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.util.ULocale; + +public class TestICUCollationKeyFilterFactory extends BaseTokenTestCase { + + /* + * Turkish has some funny casing. + * This test shows how you can solve this kind of thing easily with collation. + * Instead of using LowerCaseFilter, use a turkish collator with primary strength. + * Then things will sort and match correctly. 
+ */ + public void testBasicUsage() throws IOException { + String turkishUpperCase = "I WİLL USE TURKİSH CASING"; + String turkishLowerCase = "ı will use turkish casıng"; + ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory(); + Map args = new HashMap(); + args.put("locale", "tr"); + args.put("strength", "primary"); + factory.init(args); + factory.inform(new StringMockSolrResourceLoader("")); + TokenStream tsUpper = factory.create( + new KeywordTokenizer(new StringReader(turkishUpperCase))); + TokenStream tsLower = factory.create( + new KeywordTokenizer(new StringReader(turkishLowerCase))); + assertCollatesToSame(tsUpper, tsLower); + } + + /* + * Test usage of the decomposition option for unicode normalization. + */ + public void testNormalization() throws IOException { + String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING"; + String turkishLowerCase = "ı will use turkish casıng"; + ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory(); + Map args = new HashMap(); + args.put("locale", "tr"); + args.put("strength", "primary"); + args.put("decomposition", "canonical"); + factory.init(args); + factory.inform(new StringMockSolrResourceLoader("")); + TokenStream tsUpper = factory.create( + new KeywordTokenizer(new StringReader(turkishUpperCase))); + TokenStream tsLower = factory.create( + new KeywordTokenizer(new StringReader(turkishLowerCase))); + assertCollatesToSame(tsUpper, tsLower); + } + + /* + * Test secondary strength, for english case is not significant. + */ + public void testSecondaryStrength() throws IOException { + String upperCase = "TESTING"; + String lowerCase = "testing"; + ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory(); + Map args = new HashMap(); + args.put("locale", "en"); + args.put("strength", "secondary"); + args.put("decomposition", "no"); + factory.init(args); + factory.inform(new StringMockSolrResourceLoader("")); + TokenStream tsUpper = factory.create( + new KeywordTokenizer(new StringReader(upperCase))); + TokenStream tsLower = factory.create( + new KeywordTokenizer(new StringReader(lowerCase))); + assertCollatesToSame(tsUpper, tsLower); + } + + /* + * For german, you might want oe to sort and match with o umlaut. + * This is not the default, but you can make a customized ruleset to do this. + * + * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior. + * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383 + */ + public void testCustomRules() throws Exception { + RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE")); + + String DIN5007_2_tailorings = + "& ae , a\u0308 & AE , A\u0308"+ + "& oe , o\u0308 & OE , O\u0308"+ + "& ue , u\u0308 & UE , u\u0308"; + + RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings); + String tailoredRules = tailoredCollator.getRules(); + // + // at this point, you would save these tailoredRules to a file, + // and use the custom parameter. 
+ // + String germanUmlaut = "Töne"; + String germanOE = "Toene"; + ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory(); + Map args = new HashMap(); + args.put("custom", "rules.txt"); + args.put("strength", "primary"); + factory.init(args); + factory.inform(new StringMockSolrResourceLoader(tailoredRules)); + TokenStream tsUmlaut = factory.create( + new KeywordTokenizer(new StringReader(germanUmlaut))); + TokenStream tsOE = factory.create( + new KeywordTokenizer(new StringReader(germanOE))); + + assertCollatesToSame(tsUmlaut, tsOE); + } + + private class StringMockSolrResourceLoader implements ResourceLoader { + String text; + + StringMockSolrResourceLoader(String text) { + this.text = text; + } + + public List getLines(String resource) throws IOException { + return null; + } + + public Object newInstance(String cname, String... subpackages) { + return null; + } + + public InputStream openResource(String resource) throws IOException { + return new ByteArrayInputStream(text.getBytes("UTF-8")); + } + } + + private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) + throws IOException { + CharTermAttribute term1 = stream1 + .addAttribute(CharTermAttribute.class); + CharTermAttribute term2 = stream2 + .addAttribute(CharTermAttribute.class); + assertTrue(stream1.incrementToken()); + assertTrue(stream2.incrementToken()); + assertEquals(term1.toString(), term2.toString()); + assertFalse(stream1.incrementToken()); + assertFalse(stream2.incrementToken()); + } +} diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java new file mode 100644 index 00000000000..5fc3d653f8a --- /dev/null +++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java @@ -0,0 +1,39 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** basic tests for {@link ICUFoldingFilterFactory} */ +public class TestICUFoldingFilterFactory extends BaseTokenTestCase { + + /** basic tests to ensure the folding is working */ + public void test() throws Exception { + Reader reader = new StringReader("Résumé"); + ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory(); + factory.init(DEFAULT_VERSION_PARAM); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "resume" }); + } +} diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java new file mode 100644 index 00000000000..200890d3298 --- /dev/null +++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java @@ -0,0 +1,41 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** basic tests for {@link ICUNormalizer2FilterFactory} */ +public class TestICUNormalizer2FilterFactory extends BaseTokenTestCase { + + /** Test nfkc_cf defaults */ + public void testDefaults() throws Exception { + Reader reader = new StringReader("This is a Test"); + ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory(); + factory.init(DEFAULT_VERSION_PARAM); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" }); + } + + // TODO: add tests for different forms +} diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java new file mode 100644 index 00000000000..8b6992ec0b6 --- /dev/null +++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java @@ -0,0 +1,35 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; + +/** basic tests for {@link ICUTokenizerFactory} **/ +public class TestICUTokenizerFactory extends BaseTokenTestCase { + public void testMixedText() throws Exception { + Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ"); + ICUTokenizerFactory factory = new ICUTokenizerFactory(); + TokenStream stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", + "This", "is", "a", "test", "ກວ່າ", "ດອກ"}); + } +} diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java new file mode 100644 index 00000000000..9df2c998570 --- /dev/null +++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java @@ -0,0 +1,64 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** basic tests for {@link ICUTransformFilterFactory} */ +public class TestICUTransformFilterFactory extends BaseTokenTestCase { + + /** ensure the transform is working */ + public void test() throws Exception { + Reader reader = new StringReader("簡化字"); + ICUTransformFilterFactory factory = new ICUTransformFilterFactory(); + Map args = new HashMap(); + args.put("id", "Traditional-Simplified"); + factory.init(args); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "简化字" }); + } + + /** test forward and reverse direction */ + public void testDirection() throws Exception { + // forward + Reader reader = new StringReader("Российская Федерация"); + ICUTransformFilterFactory factory = new ICUTransformFilterFactory(); + Map args = new HashMap(); + args.put("id", "Cyrillic-Latin"); + factory.init(args); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "Rossijskaâ", "Federaciâ" }); + + // backward (invokes Latin-Cyrillic) + reader = new StringReader("Rossijskaâ Federaciâ"); + args.put("direction", "reverse"); + factory.init(args); + tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "Российская", "Федерация" }); + } +}