[analysis-icu] Allow setting unicodeSetFilter (#20814)

The unicodeSetFilter setting was only supported by the icu_folding token filter.
It is also useful in the icu_normalizer token filter and char filter, so this
commit exposes the setting there as well.
This commit is contained in:
David Causse 2017-06-16 11:08:39 +02:00 committed by Adrien Grand
parent 9ddea539f5
commit ff9edb627e
5 changed files with 90 additions and 29 deletions

View File

@ -37,6 +37,10 @@ normalization can be specified with the `name` parameter, which accepts `nfc`,
`nfkc`, and `nfkc_cf` (default). Set the `mode` parameter to `decompose` to `nfkc`, and `nfkc_cf` (default). Set the `mode` parameter to `decompose` to
convert `nfc` to `nfd` or `nfkc` to `nfkd` respectively: convert `nfc` to `nfd` or `nfkc` to `nfkd` respectively:
Which letters are normalized can be controlled by specifying the
`unicodeSetFilter` parameter, which accepts a
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].
Here are two examples, the default usage and a customised character filter: Here are two examples, the default usage and a customised character filter:
@ -189,6 +193,10 @@ without any further configuration. The type of normalization can be specified
with the `name` parameter, which accepts `nfc`, `nfkc`, and `nfkc_cf` with the `name` parameter, which accepts `nfc`, `nfkc`, and `nfkc_cf`
(default). (default).
Which letters are normalized can be controlled by specifying the
`unicodeSetFilter` parameter, which accepts a
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].
You should probably prefer the <<analysis-icu-normalization-charfilter,Normalization character filter>>. You should probably prefer the <<analysis-icu-normalization-charfilter,Normalization character filter>>.
Here are two examples, the default usage and a customised token filter: Here are two examples, the default usage and a customised token filter:

View File

@ -19,9 +19,8 @@
package org.elasticsearch.index.analysis; package org.elasticsearch.index.analysis;
import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.icu.ICUFoldingFilter; import org.apache.lucene.analysis.icu.ICUFoldingFilter;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
@ -41,31 +40,20 @@ import org.elasticsearch.index.IndexSettings;
* @author kimchy (shay.banon) * @author kimchy (shay.banon)
*/ */
public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
private final String unicodeSetFilter; /** Store here the same Normalizer used by the lucene ICUFoldingFilter */
private static final Normalizer2 ICU_FOLDING_NORMALIZER = Normalizer2.getInstance(
ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"), "utr30", Normalizer2.Mode.COMPOSE);
private final Normalizer2 normalizer;
public IcuFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { public IcuFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings); super(indexSettings, name, settings);
this.unicodeSetFilter = settings.get("unicodeSetFilter"); this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(ICU_FOLDING_NORMALIZER, settings);
} }
@Override @Override
public TokenStream create(TokenStream tokenStream) { public TokenStream create(TokenStream tokenStream) {
return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, normalizer);
// The ICUFoldingFilter is in fact implemented as an ICUNormalizer2Filter.
// ICUFoldingFilter lacks a constructor for adding filtering so we implement it here
if (unicodeSetFilter != null) {
Normalizer2 base = Normalizer2.getInstance(
ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
"utr30", Normalizer2.Mode.COMPOSE);
UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter);
unicodeSet.freeze();
Normalizer2 filtered = new FilteredNormalizer2(base, unicodeSet);
return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, filtered);
}
else {
return new ICUFoldingFilter(tokenStream);
}
} }
@Override @Override

View File

@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.Normalizer2;
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter; import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment; import org.elasticsearch.env.Environment;
@ -33,22 +34,22 @@ import java.io.Reader;
* Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter} to normalize characters. * Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter} to normalize characters.
* <p>The <tt>name</tt> can be used to provide the type of normalization to perform.</p> * <p>The <tt>name</tt> can be used to provide the type of normalization to perform.</p>
* <p>The <tt>mode</tt> can be used to provide 'compose' or 'decompose'. Default is compose.</p> * <p>The <tt>mode</tt> can be used to provide 'compose' or 'decompose'. Default is compose.</p>
* <p>The <tt>unicodeSetFilter</tt> attribute can be used to provide the UnicodeSet for filtering.</p>
*/ */
public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory implements MultiTermAwareComponent { public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory implements MultiTermAwareComponent {
private final String name;
private final Normalizer2 normalizer; private final Normalizer2 normalizer;
public IcuNormalizerCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { public IcuNormalizerCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name); super(indexSettings, name);
this.name = settings.get("name", "nfkc_cf"); String method = settings.get("name", "nfkc_cf");
String mode = settings.get("mode"); String mode = settings.get("mode");
if (!"compose".equals(mode) && !"decompose".equals(mode)) { if (!"compose".equals(mode) && !"decompose".equals(mode)) {
mode = "compose"; mode = "compose";
} }
this.normalizer = Normalizer2.getInstance( Normalizer2 normalizer = Normalizer2.getInstance(
null, this.name, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE); null, method, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(normalizer, settings);
} }
@Override @Override

View File

@ -19,7 +19,10 @@
package org.elasticsearch.index.analysis; package org.elasticsearch.index.analysis;
import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment; import org.elasticsearch.env.Environment;
@ -28,26 +31,40 @@ import org.elasticsearch.index.IndexSettings;
/** /**
* Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to normalize tokens. * Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to normalize tokens.
* <p>The <tt>name</tt> can be used to provide the type of normalization to perform. * <p>The <tt>name</tt> can be used to provide the type of normalization to perform.</p>
* <p>The <tt>unicodeSetFilter</tt> attribute can be used to provide the UnicodeSet for filtering.</p>
* *
* *
*/ */
public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
private final String name; private final Normalizer2 normalizer;
public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings); super(indexSettings, name, settings);
this.name = settings.get("name", "nfkc_cf"); String method = settings.get("name", "nfkc_cf");
Normalizer2 normalizer = Normalizer2.getInstance(null, method, Normalizer2.Mode.COMPOSE);
this.normalizer = wrapWithUnicodeSetFilter(normalizer, settings);
} }
@Override @Override
public TokenStream create(TokenStream tokenStream) { public TokenStream create(TokenStream tokenStream) {
return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE)); return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, normalizer);
} }
@Override @Override
public Object getMultiTermComponent() { public Object getMultiTermComponent() {
return this; return this;
} }
/**
 * Applies the optional {@code unicodeSetFilter} setting to a normalizer.
 *
 * @param normalizer the normalizer to use, returned as-is when no filter is configured
 * @param settings   the component settings, read for the {@code unicodeSetFilter} key
 * @return {@code normalizer} itself when the setting is absent; otherwise a
 *         {@link FilteredNormalizer2} wrapping it together with the frozen
 *         {@link UnicodeSet} built from the configured pattern
 */
static Normalizer2 wrapWithUnicodeSetFilter(final Normalizer2 normalizer, Settings settings) {
    final String pattern = settings.get("unicodeSetFilter");
    if (pattern == null) {
        // No filter configured: hand the normalizer back untouched.
        return normalizer;
    }
    final UnicodeSet filterSet = new UnicodeSet(pattern);
    filterSet.freeze();
    return new FilteredNormalizer2(normalizer, filterSet);
}
} }

View File

@ -39,3 +39,50 @@
tokenizer: keyword tokenizer: keyword
- length: { tokens: 1 } - length: { tokens: 1 }
- match: { tokens.0.token: foo bar resume } - match: { tokens.0.token: foo bar resume }
---
"Normalization with a UnicodeSet Filter":
- do:
indices.create:
index: test
body:
settings:
index:
analysis:
char_filter:
charfilter_icu_normalizer:
type: icu_normalizer
unicodeSetFilter: "[^ß]"
filter:
tokenfilter_icu_normalizer:
type: icu_normalizer
unicodeSetFilter: "[^ßB]"
tokenfilter_icu_folding:
type: icu_folding
unicodeSetFilter: "[^â]"
- do:
indices.analyze:
index: test
body:
char_filter: ["charfilter_icu_normalizer"]
tokenizer: keyword
text: charfilter Föo Bâr Ruß
- length: { tokens: 1 }
- match: { tokens.0.token: charfilter föo bâr ruß }
- do:
indices.analyze:
index: test
body:
tokenizer: keyword
filter: ["tokenfilter_icu_normalizer"]
text: tokenfilter Föo Bâr Ruß
- length: { tokens: 1 }
- match: { tokens.0.token: tokenfilter föo Bâr ruß }
- do:
indices.analyze:
index: test
body:
tokenizer: keyword
filter: ["tokenfilter_icu_folding"]
text: icufolding Föo Bâr Ruß
- length: { tokens: 1 }
- match: { tokens.0.token: icufolding foo bâr russ }