diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilterFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilterFactory.java new file mode 100644 index 00000000000..75f0206c451 --- /dev/null +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilterFactory.java @@ -0,0 +1,83 @@ +package org.apache.lucene.analysis.icu; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.util.Arrays; +import java.util.Map; + +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.CharFilterFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; + +import com.ibm.icu.text.FilteredNormalizer2; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.UnicodeSet; + +/** + * Factory for {@link ICUNormalizer2CharFilter} + *

+ * Supports the following attributes: + *

+ * @see ICUNormalizer2CharFilter + * @see Normalizer2 + * @see FilteredNormalizer2 + */ +public class ICUNormalizer2CharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent { + private final Normalizer2 normalizer; + + /** Creates a new ICUNormalizer2CharFilterFactory */ + public ICUNormalizer2CharFilterFactory(Map args) { + super(args); + String name = get(args, "name", "nfkc_cf"); + String mode = get(args, "mode", Arrays.asList("compose", "decompose"), "compose"); + Normalizer2 normalizer = Normalizer2.getInstance + (null, name, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE); + + String filter = get(args, "filter"); + if (filter != null) { + UnicodeSet set = new UnicodeSet(filter); + if (!set.isEmpty()) { + set.freeze(); + normalizer = new FilteredNormalizer2(normalizer, set); + } + } + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + this.normalizer = normalizer; + } + + @Override + public Reader create(Reader input) { + return new ICUNormalizer2CharFilter(input, normalizer); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } + +} diff --git a/lucene/analysis/icu/src/resources/META-INF/services/org.apache.lucene.analysis.util.CharFilterFactory b/lucene/analysis/icu/src/resources/META-INF/services/org.apache.lucene.analysis.util.CharFilterFactory new file mode 100644 index 00000000000..d47f7a0a915 --- /dev/null +++ b/lucene/analysis/icu/src/resources/META-INF/services/org.apache.lucene.analysis.util.CharFilterFactory @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.analysis.icu.ICUNormalizer2CharFilterFactory diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilterFactory.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilterFactory.java new file mode 100644 index 00000000000..eee19a8bffa --- /dev/null +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilterFactory.java @@ -0,0 +1,52 @@ +package org.apache.lucene.analysis.icu; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; + +/** basic tests for {@link ICUNormalizer2CharFilterFactory} */ +public class TestICUNormalizer2CharFilterFactory extends BaseTokenStreamTestCase { + + /** Test nfkc_cf defaults */ + public void testDefaults() throws Exception { + Reader reader = new StringReader("This is a Test"); + ICUNormalizer2CharFilterFactory factory = new ICUNormalizer2CharFilterFactory(new HashMap()); + reader = factory.create(reader); + TokenStream stream = whitespaceMockTokenizer(reader); + assertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" }); + } + + /** Test that bogus arguments result in exception */ + public void testBogusArguments() throws Exception { + try { + new ICUNormalizer2CharFilterFactory(new HashMap() {{ + put("bogusArg", "bogusValue"); + }}); + fail(); + } catch (IllegalArgumentException expected) { + assertTrue(expected.getMessage().contains("Unknown parameters")); + } + } + + // TODO: add tests for different forms +}