From 6781a0d2d3113e4f423bf717e9c8f781374265ca Mon Sep 17 00:00:00 2001
From: Rob Muir
+ * A normalizer with additional settings such as a filter that lists characters not + * to be normalized can be passed in the constructor. + *
*/ public final class ICUFoldingFilter extends ICUNormalizer2Filter { - // TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error. - // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html - private static final Normalizer2 normalizer = Normalizer2.getInstance( - ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"), - "utr30", Normalizer2.Mode.COMPOSE); - + /** + * A normalizer for search term folding to Unicode text, + * applying foldings from UTR#30 Character Foldings. + */ + public static final Normalizer2 NORMALIZER = Normalizer2.getInstance( + // TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error. + // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html + ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"), + "utr30", Normalizer2.Mode.COMPOSE); + /** * Create a new ICUFoldingFilter on the specified input */ public ICUFoldingFilter(TokenStream input) { + super(input, NORMALIZER); + } + + /** + * Create a new ICUFoldingFilter on the specified input with the specified + * normalizer + */ + public ICUFoldingFilter(TokenStream input, Normalizer2 normalizer) { super(input, normalizer); } } diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java index 036874ac9ff..1065cbfac81 100644 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java @@ -25,7 +25,11 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs import org.apache.lucene.analysis.util.MultiTermAwareComponent; import org.apache.lucene.analysis.util.TokenFilterFactory; -/** +import com.ibm.icu.text.FilteredNormalizer2; +import 
com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.UnicodeSet; + +/** * Factory for {@link ICUFoldingFilter}. ** <fieldType name="text_folded" class="solr.TextField" positionIncrementGap="100"> @@ -37,18 +41,30 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; * @since 3.1.0 */ public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + private final Normalizer2 normalizer; /** Creates a new ICUFoldingFilterFactory */ public ICUFoldingFilterFactory(Map<String,String> args) { super(args); + + Normalizer2 normalizer = ICUFoldingFilter.NORMALIZER; + String filter = get(args, "filter"); + if (filter != null) { + UnicodeSet set = new UnicodeSet(filter); + if (!set.isEmpty()) { + set.freeze(); + normalizer = new FilteredNormalizer2(normalizer, set); + } + } if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } + this.normalizer = normalizer; } @Override public TokenStream create(TokenStream input) { - return new ICUFoldingFilter(input); + return new ICUFoldingFilter(input, normalizer); } @Override diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java index 3782216d38c..3e3c5235791 100644 --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java @@ -26,7 +26,7 @@ import org.apache.lucene.analysis.TokenStream; /** basic tests for {@link ICUFoldingFilterFactory} */ public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase { - + /** basic tests to ensure the folding is working */ public void test() throws Exception { Reader reader = new StringReader("Résumé"); @@ -35,7 +35,24 @@ public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase { stream = factory.create(stream); 
assertTokenStreamContents(stream, new String[] { "resume" }); } - + + /** test to ensure the filter parameter is working */ + public void testFilter() throws Exception { + HashMap<String,String> args = new HashMap<String,String>(); + args.put("filter", "[^ö]"); + ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory(args); + + Reader reader = new StringReader("Résumé"); + TokenStream stream = whitespaceMockTokenizer(reader); + stream = factory.create(stream); + assertTokenStreamContents(stream, new String[] { "resume" }); + + reader = new StringReader("Fönster"); + stream = whitespaceMockTokenizer(reader); + stream = factory.create(stream); + assertTokenStreamContents(stream, new String[] { "fönster" }); + } + /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {