Deprecate unicodeSetFilter in favour of unicode_set_filter (#29215)
This commit is contained in:
parent
b6659847c2
commit
14f540e8e6
|
@ -38,7 +38,7 @@ normalization can be specified with the `name` parameter, which accepts `nfc`,
|
||||||
convert `nfc` to `nfd` or `nfkc` to `nfkd` respectively:
|
convert `nfc` to `nfd` or `nfkc` to `nfkd` respectively:
|
||||||
|
|
||||||
Which letters are normalized can be controlled by specifying the
|
Which letters are normalized can be controlled by specifying the
|
||||||
`unicodeSetFilter` parameter, which accepts a
|
`unicode_set_filter` parameter, which accepts a
|
||||||
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].
|
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].
|
||||||
|
|
||||||
Here are two examples, the default usage and a customised character filter:
|
Here are two examples, the default usage and a customised character filter:
|
||||||
|
@ -194,7 +194,7 @@ with the `name` parameter, which accepts `nfc`, `nfkc`, and `nfkc_cf`
|
||||||
(default).
|
(default).
|
||||||
|
|
||||||
Which letters are normalized can be controlled by specifying the
|
Which letters are normalized can be controlled by specifying the
|
||||||
`unicodeSetFilter` parameter, which accepts a
|
`unicode_set_filter` parameter, which accepts a
|
||||||
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].
|
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].
|
||||||
|
|
||||||
You should probably prefer the <<analysis-icu-normalization-charfilter,Normalization character filter>>.
|
You should probably prefer the <<analysis-icu-normalization-charfilter,Normalization character filter>>.
|
||||||
|
@ -273,7 +273,7 @@ The ICU folding token filter already does Unicode normalization, so there is
|
||||||
no need to use the Normalization character filter or token filter as well.
|
no need to use the Normalization character filter or token filter as well.
|
||||||
|
|
||||||
Which letters are folded can be controlled by specifying the
|
Which letters are folded can be controlled by specifying the
|
||||||
`unicodeSetFilter` parameter, which accepts a
|
`unicode_set_filter` parameter, which accepts a
|
||||||
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].
|
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].
|
||||||
|
|
||||||
The following example exempts Swedish characters from folding. It is important
|
The following example exempts Swedish characters from folding. It is important
|
||||||
|
@ -300,7 +300,7 @@ PUT icu_sample
|
||||||
"filter": {
|
"filter": {
|
||||||
"swedish_folding": {
|
"swedish_folding": {
|
||||||
"type": "icu_folding",
|
"type": "icu_folding",
|
||||||
"unicodeSetFilter": "[^åäöÅÄÖ]"
|
"unicode_set_filter": "[^åäöÅÄÖ]"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -50,7 +50,7 @@ public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory imp
|
||||||
|
|
||||||
public IcuFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
public IcuFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||||
super(indexSettings, name, settings);
|
super(indexSettings, name, settings);
|
||||||
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(ICU_FOLDING_NORMALIZER, settings);
|
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(indexSettings, ICU_FOLDING_NORMALIZER, settings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -49,7 +49,7 @@ public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory im
|
||||||
}
|
}
|
||||||
Normalizer2 normalizer = Normalizer2.getInstance(
|
Normalizer2 normalizer = Normalizer2.getInstance(
|
||||||
null, method, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
|
null, method, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
|
||||||
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(normalizer, settings);
|
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(indexSettings, normalizer, settings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -23,7 +23,10 @@ import com.ibm.icu.text.FilteredNormalizer2;
|
||||||
import com.ibm.icu.text.Normalizer2;
|
import com.ibm.icu.text.Normalizer2;
|
||||||
import com.ibm.icu.text.UnicodeSet;
|
import com.ibm.icu.text.UnicodeSet;
|
||||||
|
|
||||||
|
import org.apache.logging.log4j.LogManager;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.elasticsearch.Version;
|
||||||
|
import org.elasticsearch.common.logging.DeprecationLogger;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.env.Environment;
|
import org.elasticsearch.env.Environment;
|
||||||
import org.elasticsearch.index.IndexSettings;
|
import org.elasticsearch.index.IndexSettings;
|
||||||
|
@ -35,14 +38,15 @@ import org.elasticsearch.index.IndexSettings;
|
||||||
* <p>The {@code unicodeSetFilter} attribute can be used to provide the UnicodeSet for filtering.</p>
|
* <p>The {@code unicode_set_filter} attribute (formerly {@code unicodeSetFilter}, now deprecated) can be used to provide the UnicodeSet for filtering.</p>
|
||||||
*/
|
*/
|
||||||
public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||||
|
private final static DeprecationLogger deprecationLogger =
|
||||||
|
new DeprecationLogger(LogManager.getLogger(IcuNormalizerTokenFilterFactory.class));
|
||||||
private final Normalizer2 normalizer;
|
private final Normalizer2 normalizer;
|
||||||
|
|
||||||
public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||||
super(indexSettings, name, settings);
|
super(indexSettings, name, settings);
|
||||||
String method = settings.get("name", "nfkc_cf");
|
String method = settings.get("name", "nfkc_cf");
|
||||||
Normalizer2 normalizer = Normalizer2.getInstance(null, method, Normalizer2.Mode.COMPOSE);
|
Normalizer2 normalizer = Normalizer2.getInstance(null, method, Normalizer2.Mode.COMPOSE);
|
||||||
this.normalizer = wrapWithUnicodeSetFilter(normalizer, settings);
|
this.normalizer = wrapWithUnicodeSetFilter(indexSettings, normalizer, settings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -55,8 +59,17 @@ public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
static Normalizer2 wrapWithUnicodeSetFilter(final Normalizer2 normalizer, Settings settings) {
|
static Normalizer2 wrapWithUnicodeSetFilter(final IndexSettings indexSettings,
|
||||||
|
final Normalizer2 normalizer,
|
||||||
|
final Settings settings) {
|
||||||
String unicodeSetFilter = settings.get("unicodeSetFilter");
|
String unicodeSetFilter = settings.get("unicodeSetFilter");
|
||||||
|
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
|
||||||
|
if (unicodeSetFilter != null) {
|
||||||
|
deprecationLogger.deprecated("[unicodeSetFilter] has been deprecated in favor of [unicode_set_filter]");
|
||||||
|
} else {
|
||||||
|
unicodeSetFilter = settings.get("unicode_set_filter");
|
||||||
|
}
|
||||||
|
}
|
||||||
if (unicodeSetFilter != null) {
|
if (unicodeSetFilter != null) {
|
||||||
UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter);
|
UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter);
|
||||||
|
|
||||||
|
|
|
@ -48,6 +48,61 @@
|
||||||
---
|
---
|
||||||
"Normalization with a UnicodeSet Filter":
|
"Normalization with a UnicodeSet Filter":
|
||||||
- do:
|
- do:
|
||||||
|
indices.create:
|
||||||
|
index: test
|
||||||
|
body:
|
||||||
|
settings:
|
||||||
|
index:
|
||||||
|
analysis:
|
||||||
|
char_filter:
|
||||||
|
charfilter_icu_normalizer:
|
||||||
|
type: icu_normalizer
|
||||||
|
unicode_set_filter: "[^ß]"
|
||||||
|
filter:
|
||||||
|
tokenfilter_icu_normalizer:
|
||||||
|
type: icu_normalizer
|
||||||
|
unicode_set_filter: "[^ßB]"
|
||||||
|
tokenfilter_icu_folding:
|
||||||
|
type: icu_folding
|
||||||
|
unicode_set_filter: "[^â]"
|
||||||
|
- do:
|
||||||
|
indices.analyze:
|
||||||
|
index: test
|
||||||
|
body:
|
||||||
|
char_filter: ["charfilter_icu_normalizer"]
|
||||||
|
tokenizer: keyword
|
||||||
|
text: charfilter Föo Bâr Ruß
|
||||||
|
- length: { tokens: 1 }
|
||||||
|
- match: { tokens.0.token: charfilter föo bâr ruß }
|
||||||
|
- do:
|
||||||
|
indices.analyze:
|
||||||
|
index: test
|
||||||
|
body:
|
||||||
|
tokenizer: keyword
|
||||||
|
filter: ["tokenfilter_icu_normalizer"]
|
||||||
|
text: tokenfilter Föo Bâr Ruß
|
||||||
|
- length: { tokens: 1 }
|
||||||
|
- match: { tokens.0.token: tokenfilter föo Bâr ruß }
|
||||||
|
- do:
|
||||||
|
indices.analyze:
|
||||||
|
index: test
|
||||||
|
body:
|
||||||
|
tokenizer: keyword
|
||||||
|
filter: ["tokenfilter_icu_folding"]
|
||||||
|
text: icufolding Föo Bâr Ruß
|
||||||
|
- length: { tokens: 1 }
|
||||||
|
- match: { tokens.0.token: icufolding foo bâr russ }
|
||||||
|
|
||||||
|
---
|
||||||
|
"Normalization with a CamelCase UnicodeSet Filter":
|
||||||
|
- skip:
|
||||||
|
version: " - 6.99.99"
|
||||||
|
reason: unicodeSetFilter deprecated in 7.0.0, replaced by unicode_set_filter
|
||||||
|
features: "warnings"
|
||||||
|
|
||||||
|
- do:
|
||||||
|
warnings:
|
||||||
|
- "[unicodeSetFilter] has been deprecated in favor of [unicode_set_filter]"
|
||||||
indices.create:
|
indices.create:
|
||||||
index: test
|
index: test
|
||||||
body:
|
body:
|
||||||
|
|
Loading…
Reference in New Issue