LUCENE-8129: allow passing filtered unicode sets to ICUFoldingFilter

This commit is contained in:
Rob Muir 2018-01-16 12:41:31 -08:00
parent a6b5c5bfb0
commit 6781a0d2d3
4 changed files with 62 additions and 10 deletions

View File

@ -125,6 +125,9 @@ Improvements
* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
* LUCENE-8129: A Unicode set filter can now be specified when using ICUFoldingFilter.
(Ere Maijala)
Bug Fixes
* LUCENE-8077: Fixed bug in how CheckIndex verifies doc-value iterators.

View File

@ -59,11 +59,19 @@ import com.ibm.icu.text.Normalizer2;
* All foldings, case folding, and normalization mappings are applied recursively
* to ensure a fully folded and normalized result.
* </p>
* <p>
* A normalizer with additional settings, such as a filter set that restricts which
* characters are normalized (all others pass through unchanged), can be passed to the constructor.
* </p>
*/ */
public final class ICUFoldingFilter extends ICUNormalizer2Filter { public final class ICUFoldingFilter extends ICUNormalizer2Filter {
/**
* A normalizer for search term folding to Unicode text,
* applying foldings from UTR#30 Character Foldings.
*/
public static final Normalizer2 NORMALIZER = Normalizer2.getInstance(
// TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error. // TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
private static final Normalizer2 normalizer = Normalizer2.getInstance(
ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"), ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
"utr30", Normalizer2.Mode.COMPOSE); "utr30", Normalizer2.Mode.COMPOSE);
@ -71,6 +79,14 @@ public final class ICUFoldingFilter extends ICUNormalizer2Filter {
* Create a new ICUFoldingFilter on the specified input * Create a new ICUFoldingFilter on the specified input
*/ */
public ICUFoldingFilter(TokenStream input) { public ICUFoldingFilter(TokenStream input) {
super(input, NORMALIZER);
}
/**
* Create a new ICUFoldingFilter on the specified input with the specified
* normalizer
*/
public ICUFoldingFilter(TokenStream input, Normalizer2 normalizer) {
super(input, normalizer); super(input, normalizer);
} }
} }

View File

@ -25,6 +25,10 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
import org.apache.lucene.analysis.util.MultiTermAwareComponent; import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory;
import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;
/** /**
* Factory for {@link ICUFoldingFilter}. * Factory for {@link ICUFoldingFilter}.
* <pre class="prettyprint"> * <pre class="prettyprint">
@ -37,18 +41,30 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* @since 3.1.0 * @since 3.1.0
*/ */
public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
private final Normalizer2 normalizer;
/** Creates a new ICUFoldingFilterFactory */ /** Creates a new ICUFoldingFilterFactory */
public ICUFoldingFilterFactory(Map<String,String> args) { public ICUFoldingFilterFactory(Map<String,String> args) {
super(args); super(args);
Normalizer2 normalizer = ICUFoldingFilter.NORMALIZER;
String filter = get(args, "filter");
if (filter != null) {
UnicodeSet set = new UnicodeSet(filter);
if (!set.isEmpty()) {
set.freeze();
normalizer = new FilteredNormalizer2(normalizer, set);
}
}
if (!args.isEmpty()) { if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args); throw new IllegalArgumentException("Unknown parameters: " + args);
} }
this.normalizer = normalizer;
} }
@Override @Override
public TokenStream create(TokenStream input) { public TokenStream create(TokenStream input) {
return new ICUFoldingFilter(input); return new ICUFoldingFilter(input, normalizer);
} }
@Override @Override

View File

@ -36,6 +36,23 @@ public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
assertTokenStreamContents(stream, new String[] { "resume" }); assertTokenStreamContents(stream, new String[] { "resume" });
} }
/** Verifies that the "filter" argument limits which characters the folding filter touches. */
public void testFilter() throws Exception {
  HashMap<String,String> params = new HashMap<>();
  params.put("filter", "[^ö]");
  ICUFoldingFilterFactory foldingFactory = new ICUFoldingFilterFactory(params);

  // No 'ö' in this token, so it is expected to fold completely.
  Reader accentedInput = new StringReader("Résumé");
  TokenStream folded = foldingFactory.create(whitespaceMockTokenizer(accentedInput));
  assertTokenStreamContents(folded, new String[] { "resume" });

  // 'ö' is excluded by the set: it must survive, while 'F' is still expected to be lowercased.
  Reader umlautInput = new StringReader("Fönster");
  TokenStream preserved = foldingFactory.create(whitespaceMockTokenizer(umlautInput));
  assertTokenStreamContents(preserved, new String[] { "fönster" });
}
/** Test that bogus arguments result in exception */ /** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception { public void testBogusArguments() throws Exception {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {