mirror of https://github.com/apache/lucene.git
LUCENE-8129: allow passing filtered unicode sets to ICUFoldingFilter
This commit is contained in:
parent
a6b5c5bfb0
commit
6781a0d2d3
|
@@ -125,6 +125,9 @@ Improvements
|
||||||
|
|
||||||
* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
|
* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-8129: A Unicode set filter can now be specified when using ICUFoldingFilter.
|
||||||
|
(Ere Maijala)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
|
|
||||||
* LUCENE-8077: Fixed bug in how CheckIndex verifies doc-value iterators.
|
* LUCENE-8077: Fixed bug in how CheckIndex verifies doc-value iterators.
|
||||||
|
|
|
@@ -59,18 +59,34 @@ import com.ibm.icu.text.Normalizer2;
|
||||||
* All foldings, case folding, and normalization mappings are applied recursively
|
* All foldings, case folding, and normalization mappings are applied recursively
|
||||||
* to ensure a fully folded and normalized result.
|
* to ensure a fully folded and normalized result.
|
||||||
* </p>
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* A normalizer with additional settings such as a filter that lists characters not
|
||||||
|
* to be normalized can be passed in the constructor.
|
||||||
|
* </p>
|
||||||
*/
|
*/
|
||||||
public final class ICUFoldingFilter extends ICUNormalizer2Filter {
|
public final class ICUFoldingFilter extends ICUNormalizer2Filter {
|
||||||
// TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
|
/**
|
||||||
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
|
* A normalizer for search term folding to Unicode text,
|
||||||
private static final Normalizer2 normalizer = Normalizer2.getInstance(
|
* applying foldings from UTR#30 Character Foldings.
|
||||||
ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
|
*/
|
||||||
"utr30", Normalizer2.Mode.COMPOSE);
|
public static final Normalizer2 NORMALIZER = Normalizer2.getInstance(
|
||||||
|
// TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
|
||||||
|
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
|
||||||
|
ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
|
||||||
|
"utr30", Normalizer2.Mode.COMPOSE);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new ICUFoldingFilter on the specified input
|
* Create a new ICUFoldingFilter on the specified input
|
||||||
*/
|
*/
|
||||||
public ICUFoldingFilter(TokenStream input) {
|
public ICUFoldingFilter(TokenStream input) {
|
||||||
|
super(input, NORMALIZER);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new ICUFoldingFilter on the specified input with the specified
|
||||||
|
* normalizer
|
||||||
|
*/
|
||||||
|
public ICUFoldingFilter(TokenStream input, Normalizer2 normalizer) {
|
||||||
super(input, normalizer);
|
super(input, normalizer);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@@ -25,6 +25,10 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
|
||||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
|
|
||||||
|
import com.ibm.icu.text.FilteredNormalizer2;
|
||||||
|
import com.ibm.icu.text.Normalizer2;
|
||||||
|
import com.ibm.icu.text.UnicodeSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Factory for {@link ICUFoldingFilter}.
|
* Factory for {@link ICUFoldingFilter}.
|
||||||
* <pre class="prettyprint">
|
* <pre class="prettyprint">
|
||||||
|
@@ -37,18 +41,30 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
* @since 3.1.0
|
* @since 3.1.0
|
||||||
*/
|
*/
|
||||||
public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||||
|
private final Normalizer2 normalizer;
|
||||||
|
|
||||||
/** Creates a new ICUFoldingFilterFactory */
|
/** Creates a new ICUFoldingFilterFactory */
|
||||||
public ICUFoldingFilterFactory(Map<String,String> args) {
|
public ICUFoldingFilterFactory(Map<String,String> args) {
|
||||||
super(args);
|
super(args);
|
||||||
|
|
||||||
|
Normalizer2 normalizer = ICUFoldingFilter.NORMALIZER;
|
||||||
|
String filter = get(args, "filter");
|
||||||
|
if (filter != null) {
|
||||||
|
UnicodeSet set = new UnicodeSet(filter);
|
||||||
|
if (!set.isEmpty()) {
|
||||||
|
set.freeze();
|
||||||
|
normalizer = new FilteredNormalizer2(normalizer, set);
|
||||||
|
}
|
||||||
|
}
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
}
|
}
|
||||||
|
this.normalizer = normalizer;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream input) {
|
public TokenStream create(TokenStream input) {
|
||||||
return new ICUFoldingFilter(input);
|
return new ICUFoldingFilter(input, normalizer);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@@ -36,6 +36,23 @@ public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
|
||||||
assertTokenStreamContents(stream, new String[] { "resume" });
|
assertTokenStreamContents(stream, new String[] { "resume" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** test to ensure the filter parameter is working */
|
||||||
|
public void testFilter() throws Exception {
|
||||||
|
HashMap<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("filter", "[^ö]");
|
||||||
|
ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory(args);
|
||||||
|
|
||||||
|
Reader reader = new StringReader("Résumé");
|
||||||
|
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||||
|
stream = factory.create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "resume" });
|
||||||
|
|
||||||
|
reader = new StringReader("Fönster");
|
||||||
|
stream = whitespaceMockTokenizer(reader);
|
||||||
|
stream = factory.create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "fönster" });
|
||||||
|
}
|
||||||
|
|
||||||
/** Test that bogus arguments result in exception */
|
/** Test that bogus arguments result in exception */
|
||||||
public void testBogusArguments() throws Exception {
|
public void testBogusArguments() throws Exception {
|
||||||
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
|
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||||
|
|
Loading…
Reference in New Issue