mirror of https://github.com/apache/lucene.git

LUCENE-8129: allow passing filtered unicode sets to ICUFoldingFilter

Commit 6781a0d2d3 (parent a6b5c5bfb0)
CHANGES.txt:

@@ -125,6 +125,9 @@ Improvements
 
 * LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
 
+* LUCENE-8129: A Unicode set filter can now be specified when using ICUFoldingFilter.
+  (Ere Maijala)
+
 Bug Fixes
 
 * LUCENE-8077: Fixed bug in how CheckIndex verifies doc-value iterators.
ICUFoldingFilter.java:

@@ -59,18 +59,34 @@ import com.ibm.icu.text.Normalizer2;
  * All foldings, case folding, and normalization mappings are applied recursively
  * to ensure a fully folded and normalized result.
  * </p>
+ * <p>
+ * A normalizer with additional settings such as a filter that lists characters not
+ * to be normalized can be passed in the constructor.
+ * </p>
  */
 public final class ICUFoldingFilter extends ICUNormalizer2Filter {
-  // TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
-  // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
-  private static final Normalizer2 normalizer = Normalizer2.getInstance(
-    ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
-    "utr30", Normalizer2.Mode.COMPOSE);
+  /**
+   * A normalizer for search term folding to Unicode text,
+   * applying foldings from UTR#30 Character Foldings.
+   */
+  public static final Normalizer2 NORMALIZER = Normalizer2.getInstance(
+      // TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
+      // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
+      ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
+      "utr30", Normalizer2.Mode.COMPOSE);
 
   /**
    * Create a new ICUFoldingFilter on the specified input
    */
   public ICUFoldingFilter(TokenStream input) {
+    super(input, NORMALIZER);
+  }
+
+  /**
+   * Create a new ICUFoldingFilter on the specified input with the specified
+   * normalizer
+   */
+  public ICUFoldingFilter(TokenStream input, Normalizer2 normalizer) {
     super(input, normalizer);
   }
 }
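Note: with NORMALIZER now public and a second constructor that accepts any Normalizer2, callers can wrap the UTR#30 folding normalizer before handing it to the filter. A minimal sketch of that usage (the helper method and the incoming "input" stream are illustrative, not part of the commit):

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.icu.ICUFoldingFilter;

    import com.ibm.icu.text.FilteredNormalizer2;
    import com.ibm.icu.text.Normalizer2;
    import com.ibm.icu.text.UnicodeSet;

    // Fold everything except "ö": restrict the shared UTR#30 normalizer to the
    // complement set [^ö] before passing it to the new two-argument constructor.
    static TokenStream foldingExceptODiaeresis(TokenStream input) {
      UnicodeSet toFold = new UnicodeSet("[^ö]").freeze();
      Normalizer2 filtered = new FilteredNormalizer2(ICUFoldingFilter.NORMALIZER, toFold);
      return new ICUFoldingFilter(input, filtered);
    }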
ICUFoldingFilterFactory.java:

@@ -25,7 +25,11 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
 import org.apache.lucene.analysis.util.MultiTermAwareComponent;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 
+import com.ibm.icu.text.FilteredNormalizer2;
+import com.ibm.icu.text.Normalizer2;
+import com.ibm.icu.text.UnicodeSet;
+
 /**
  * Factory for {@link ICUFoldingFilter}.
  * <pre class="prettyprint">
  * <fieldType name="text_folded" class="solr.TextField" positionIncrementGap="100">
@@ -37,18 +41,30 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
  * @since 3.1.0
  */
 public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+  private final Normalizer2 normalizer;
 
   /** Creates a new ICUFoldingFilterFactory */
   public ICUFoldingFilterFactory(Map<String,String> args) {
     super(args);
+
+    Normalizer2 normalizer = ICUFoldingFilter.NORMALIZER;
+    String filter = get(args, "filter");
+    if (filter != null) {
+      UnicodeSet set = new UnicodeSet(filter);
+      if (!set.isEmpty()) {
+        set.freeze();
+        normalizer = new FilteredNormalizer2(normalizer, set);
+      }
+    }
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
+    this.normalizer = normalizer;
   }
 
   @Override
   public TokenStream create(TokenStream input) {
-    return new ICUFoldingFilter(input);
+    return new ICUFoldingFilter(input, normalizer);
   }
 
   @Override
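Note: the factory exposes the same behaviour through the new optional "filter" argument, a UnicodeSet pattern naming the characters that are folded; anything outside the set passes through unchanged. A rough sketch of programmatic use (in a Solr schema the same value would presumably be set as filter="[^ö]" on the filter element; the "tokenizer" variable stands for whatever TokenStream precedes this filter):

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.icu.ICUFoldingFilterFactory;

    Map<String,String> args = new HashMap<>();
    args.put("filter", "[^ö]");   // fold every character except ö
    ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory(args);
    TokenStream stream = factory.create(tokenizer);
    // "Résumé"  -> "resume"   (é is inside the set, so it is folded)
    // "Fönster" -> "fönster"  (ö is outside the set, so it is preserved)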
TestICUFoldingFilterFactory.java:

@@ -26,7 +26,7 @@ import org.apache.lucene.analysis.TokenStream;
 
 /** basic tests for {@link ICUFoldingFilterFactory} */
 public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
 
   /** basic tests to ensure the folding is working */
   public void test() throws Exception {
     Reader reader = new StringReader("Résumé");
@@ -35,7 +35,24 @@ public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
     stream = factory.create(stream);
     assertTokenStreamContents(stream, new String[] { "resume" });
   }
 
+  /** test to ensure the filter parameter is working */
+  public void testFilter() throws Exception {
+    HashMap<String,String> args = new HashMap<String,String>();
+    args.put("filter", "[^ö]");
+    ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory(args);
+
+    Reader reader = new StringReader("Résumé");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] { "resume" });
+
+    reader = new StringReader("Fönster");
+    stream = whitespaceMockTokenizer(reader);
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] { "fönster" });
+  }
+
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
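Note: the expectations in testFilter follow from FilteredNormalizer2 semantics: the wrapped normalizer is applied only to characters inside the supplied UnicodeSet, so with "[^ö]" the ö of "Fönster" is skipped while everything else is still case-folded and stripped of accents. A small illustration against the ICU API directly (the commented results are what the tests above imply, not captured output):

    Normalizer2 plain = ICUFoldingFilter.NORMALIZER;
    Normalizer2 filtered =
        new FilteredNormalizer2(plain, new UnicodeSet("[^ö]").freeze());

    plain.normalize("Fönster");     // "fonster": ö is folded like any other character
    filtered.normalize("Fönster");  // "fönster": ö lies outside [^ö] and is left as-is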