[analysis-icu] Allow setting unicodeSetFilter (#20814)
Previously, the `unicodeSetFilter` setting was only supported by the icu_folding token filter. This commit also exposes it in the icu_normalizer token filter and the icu_normalizer char filter.
This commit is contained in:
parent
9ddea539f5
commit
ff9edb627e
|
@ -37,6 +37,10 @@ normalization can be specified with the `name` parameter, which accepts `nfc`,
|
||||||
`nfkc`, and `nfkc_cf` (default). Set the `mode` parameter to `decompose` to
|
`nfkc`, and `nfkc_cf` (default). Set the `mode` parameter to `decompose` to
|
||||||
convert `nfc` to `nfd` or `nfkc` to `nfkd` respectively:
|
convert `nfc` to `nfd` or `nfkc` to `nfkd` respectively:
|
||||||
|
|
||||||
|
Which letters are normalized can be controlled by specifying the
|
||||||
|
`unicodeSetFilter` parameter, which accepts a
|
||||||
|
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].
|
||||||
|
|
||||||
Here are two examples, the default usage and a customised character filter:
|
Here are two examples, the default usage and a customised character filter:
|
||||||
|
|
||||||
|
|
||||||
|
@ -189,6 +193,10 @@ without any further configuration. The type of normalization can be specified
|
||||||
with the `name` parameter, which accepts `nfc`, `nfkc`, and `nfkc_cf`
|
with the `name` parameter, which accepts `nfc`, `nfkc`, and `nfkc_cf`
|
||||||
(default).
|
(default).
|
||||||
|
|
||||||
|
Which letters are normalized can be controlled by specifying the
|
||||||
|
`unicodeSetFilter` parameter, which accepts a
|
||||||
|
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].
|
||||||
|
|
||||||
You should probably prefer the <<analysis-icu-normalization-charfilter,Normalization character filter>>.
|
You should probably prefer the <<analysis-icu-normalization-charfilter,Normalization character filter>>.
|
||||||
|
|
||||||
Here are two examples, the default usage and a customised token filter:
|
Here are two examples, the default usage and a customised token filter:
|
||||||
|
|
|
@ -19,9 +19,8 @@
|
||||||
|
|
||||||
package org.elasticsearch.index.analysis;
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
import com.ibm.icu.text.FilteredNormalizer2;
|
|
||||||
import com.ibm.icu.text.Normalizer2;
|
import com.ibm.icu.text.Normalizer2;
|
||||||
import com.ibm.icu.text.UnicodeSet;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
|
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
@ -41,31 +40,20 @@ import org.elasticsearch.index.IndexSettings;
|
||||||
* @author kimchy (shay.banon)
|
* @author kimchy (shay.banon)
|
||||||
*/
|
*/
|
||||||
public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||||
private final String unicodeSetFilter;
|
/** Store here the same Normalizer used by the lucene ICUFoldingFilter */
|
||||||
|
private static final Normalizer2 ICU_FOLDING_NORMALIZER = Normalizer2.getInstance(
|
||||||
|
ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"), "utr30", Normalizer2.Mode.COMPOSE);
|
||||||
|
|
||||||
|
private final Normalizer2 normalizer;
|
||||||
|
|
||||||
public IcuFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
public IcuFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||||
super(indexSettings, name, settings);
|
super(indexSettings, name, settings);
|
||||||
this.unicodeSetFilter = settings.get("unicodeSetFilter");
|
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(ICU_FOLDING_NORMALIZER, settings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
|
return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, normalizer);
|
||||||
// The ICUFoldingFilter is in fact implemented as a ICUNormalizer2Filter.
|
|
||||||
// ICUFoldingFilter lacks a constructor for adding filtering so we implement it here
|
|
||||||
if (unicodeSetFilter != null) {
|
|
||||||
Normalizer2 base = Normalizer2.getInstance(
|
|
||||||
ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
|
|
||||||
"utr30", Normalizer2.Mode.COMPOSE);
|
|
||||||
UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter);
|
|
||||||
|
|
||||||
unicodeSet.freeze();
|
|
||||||
Normalizer2 filtered = new FilteredNormalizer2(base, unicodeSet);
|
|
||||||
return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, filtered);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return new ICUFoldingFilter(tokenStream);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
|
||||||
import com.ibm.icu.text.Normalizer2;
|
import com.ibm.icu.text.Normalizer2;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
|
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.env.Environment;
|
import org.elasticsearch.env.Environment;
|
||||||
|
@ -33,22 +34,22 @@ import java.io.Reader;
|
||||||
* Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter} to normalize characters.
|
* Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter} to normalize characters.
|
||||||
* <p>The <tt>name</tt> can be used to provide the type of normalization to perform.</p>
|
* <p>The <tt>name</tt> can be used to provide the type of normalization to perform.</p>
|
||||||
* <p>The <tt>mode</tt> can be used to provide 'compose' or 'decompose'. Default is compose.</p>
|
* <p>The <tt>mode</tt> can be used to provide 'compose' or 'decompose'. Default is compose.</p>
|
||||||
|
* <p>The <tt>unicodeSetFilter</tt> attribute can be used to provide the UnicodeSet for filtering.</p>
|
||||||
*/
|
*/
|
||||||
public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory implements MultiTermAwareComponent {
|
public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory implements MultiTermAwareComponent {
|
||||||
|
|
||||||
private final String name;
|
|
||||||
|
|
||||||
private final Normalizer2 normalizer;
|
private final Normalizer2 normalizer;
|
||||||
|
|
||||||
public IcuNormalizerCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
public IcuNormalizerCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||||
super(indexSettings, name);
|
super(indexSettings, name);
|
||||||
this.name = settings.get("name", "nfkc_cf");
|
String method = settings.get("name", "nfkc_cf");
|
||||||
String mode = settings.get("mode");
|
String mode = settings.get("mode");
|
||||||
if (!"compose".equals(mode) && !"decompose".equals(mode)) {
|
if (!"compose".equals(mode) && !"decompose".equals(mode)) {
|
||||||
mode = "compose";
|
mode = "compose";
|
||||||
}
|
}
|
||||||
this.normalizer = Normalizer2.getInstance(
|
Normalizer2 normalizer = Normalizer2.getInstance(
|
||||||
null, this.name, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
|
null, method, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
|
||||||
|
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(normalizer, settings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -19,7 +19,10 @@
|
||||||
|
|
||||||
package org.elasticsearch.index.analysis;
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
import com.ibm.icu.text.FilteredNormalizer2;
|
||||||
import com.ibm.icu.text.Normalizer2;
|
import com.ibm.icu.text.Normalizer2;
|
||||||
|
import com.ibm.icu.text.UnicodeSet;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.env.Environment;
|
import org.elasticsearch.env.Environment;
|
||||||
|
@ -28,26 +31,40 @@ import org.elasticsearch.index.IndexSettings;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to normalize tokens.
|
* Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to normalize tokens.
|
||||||
* <p>The <tt>name</tt> can be used to provide the type of normalization to perform.
|
* <p>The <tt>name</tt> can be used to provide the type of normalization to perform.</p>
|
||||||
|
* <p>The <tt>unicodeSetFilter</tt> attribute can be used to provide the UnicodeSet for filtering.</p>
|
||||||
*
|
*
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||||
|
|
||||||
private final String name;
|
private final Normalizer2 normalizer;
|
||||||
|
|
||||||
public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||||
super(indexSettings, name, settings);
|
super(indexSettings, name, settings);
|
||||||
this.name = settings.get("name", "nfkc_cf");
|
String method = settings.get("name", "nfkc_cf");
|
||||||
|
Normalizer2 normalizer = Normalizer2.getInstance(null, method, Normalizer2.Mode.COMPOSE);
|
||||||
|
this.normalizer = wrapWithUnicodeSetFilter(normalizer, settings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE));
|
return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, normalizer);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Object getMultiTermComponent() {
|
public Object getMultiTermComponent() {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static Normalizer2 wrapWithUnicodeSetFilter(final Normalizer2 normalizer, Settings settings) {
|
||||||
|
String unicodeSetFilter = settings.get("unicodeSetFilter");
|
||||||
|
if (unicodeSetFilter != null) {
|
||||||
|
UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter);
|
||||||
|
|
||||||
|
unicodeSet.freeze();
|
||||||
|
return new FilteredNormalizer2(normalizer, unicodeSet);
|
||||||
|
}
|
||||||
|
return normalizer;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,3 +39,50 @@
|
||||||
tokenizer: keyword
|
tokenizer: keyword
|
||||||
- length: { tokens: 1 }
|
- length: { tokens: 1 }
|
||||||
- match: { tokens.0.token: foo bar resume }
|
- match: { tokens.0.token: foo bar resume }
|
||||||
|
---
|
||||||
|
"Normalization with a UnicodeSet Filter":
|
||||||
|
- do:
|
||||||
|
indices.create:
|
||||||
|
index: test
|
||||||
|
body:
|
||||||
|
settings:
|
||||||
|
index:
|
||||||
|
analysis:
|
||||||
|
char_filter:
|
||||||
|
charfilter_icu_normalizer:
|
||||||
|
type: icu_normalizer
|
||||||
|
unicodeSetFilter: "[^ß]"
|
||||||
|
filter:
|
||||||
|
tokenfilter_icu_normalizer:
|
||||||
|
type: icu_normalizer
|
||||||
|
unicodeSetFilter: "[^ßB]"
|
||||||
|
tokenfilter_icu_folding:
|
||||||
|
type: icu_folding
|
||||||
|
unicodeSetFilter: "[^â]"
|
||||||
|
- do:
|
||||||
|
indices.analyze:
|
||||||
|
index: test
|
||||||
|
body:
|
||||||
|
char_filter: ["charfilter_icu_normalizer"]
|
||||||
|
tokenizer: keyword
|
||||||
|
text: charfilter Föo Bâr Ruß
|
||||||
|
- length: { tokens: 1 }
|
||||||
|
- match: { tokens.0.token: charfilter föo bâr ruß }
|
||||||
|
- do:
|
||||||
|
indices.analyze:
|
||||||
|
index: test
|
||||||
|
body:
|
||||||
|
tokenizer: keyword
|
||||||
|
filter: ["tokenfilter_icu_normalizer"]
|
||||||
|
text: tokenfilter Föo Bâr Ruß
|
||||||
|
- length: { tokens: 1 }
|
||||||
|
- match: { tokens.0.token: tokenfilter föo Bâr ruß }
|
||||||
|
- do:
|
||||||
|
indices.analyze:
|
||||||
|
index: test
|
||||||
|
body:
|
||||||
|
tokenizer: keyword
|
||||||
|
filter: ["tokenfilter_icu_folding"]
|
||||||
|
text: icufolding Föo Bâr Ruß
|
||||||
|
- length: { tokens: 1 }
|
||||||
|
- match: { tokens.0.token: icufolding foo bâr russ }
|
||||||
|
|
Loading…
Reference in New Issue