Fix failing ICU tests (#35207)

Fixes #35173
Alan Woodward 2018-11-06 09:02:40 +00:00 committed by GitHub
parent 833e0f8ecf
commit 9f4b93fd5e
3 changed files with 27 additions and 41 deletions

View File

@@ -40,9 +40,3 @@ dependencyLicenses {
mapping from: /lucene-.*/, to: 'lucene'
}
// Muted: https://github.com/elastic/elasticsearch/issues/35173
integTestRunner {
systemProperty 'tests.rest.blacklist',
'analysis_icu/10_basic/Normalization with a UnicodeSet Filter,' +
'analysis_icu/10_basic/Normalization with a CamcelCase UnicodeSet Filter'
}

View File

@@ -38,8 +38,10 @@ import org.elasticsearch.index.IndexSettings;
* <p>The {@code unicodeSetFilter} attribute can be used to provide the UnicodeSet for filtering.</p>
*/
public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
private static final DeprecationLogger deprecationLogger =
new DeprecationLogger(LogManager.getLogger(IcuNormalizerTokenFilterFactory.class));
private final Normalizer2 normalizer;
public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
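The hunk above only adds the deprecationLogger field; the method that reads the unicodeSetFilter / unicode_set_filter setting is not part of this diff. As a rough, hypothetical sketch (the helper class and method names are invented here, and the deprecated(String) call is assumed from the DeprecationLogger API of this era), the fallback that would produce the warning asserted by the updated REST test could look like this:

```java
// Hypothetical helper, NOT code from this commit -- it only illustrates how the new
// deprecationLogger field could back the warning asserted by the REST test below.
import org.apache.logging.log4j.LogManager;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;

class UnicodeSetFilterSettings {

    private static final DeprecationLogger deprecationLogger =
            new DeprecationLogger(LogManager.getLogger(UnicodeSetFilterSettings.class));

    /**
     * Prefer the new snake_case key and fall back to the deprecated camelCase key,
     * emitting a deprecation warning when the old name is used.
     */
    static String resolveUnicodeSetFilter(Settings settings) {
        String pattern = settings.get("unicode_set_filter");
        if (pattern == null && settings.get("unicodeSetFilter") != null) {
            deprecationLogger.deprecated(
                    "[unicodeSetFilter] has been deprecated in favor of [unicode_set_filter]");
            pattern = settings.get("unicodeSetFilter");
        }
        return pattern; // null means: leave the Normalizer2 unfiltered
    }
}
```

The warning string matches the one listed under `warnings:` in the YAML test change further down.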

View File

@@ -46,7 +46,7 @@
- match: { tokens.1.token: bar }
- match: { tokens.2.token: resume }
---
"Normalization with a UnicodeSet Filter":
"Normalization with unicode_set_filter":
- do:
indices.create:
index: test
@@ -70,31 +70,42 @@
index: test
body:
char_filter: ["charfilter_icu_normalizer"]
tokenizer: keyword
tokenizer: standard
text: charfilter Föo Bâr Ruß
- length: { tokens: 1 }
- match: { tokens.0.token: charfilter föo bâr ruß }
- length: { tokens: 4 }
- match: { tokens.0.token: charfilter }
- match: { tokens.1.token: föo }
- match: { tokens.2.token: bâr }
- match: { tokens.3.token: ruß }
- do:
indices.analyze:
index: test
body:
tokenizer: keyword
tokenizer: standard
filter: ["tokenfilter_icu_normalizer"]
text: tokenfilter Föo Bâr Ruß
- length: { tokens: 1 }
- match: { tokens.0.token: tokenfilter föo Bâr ruß }
- length: { tokens: 4 }
- match: { tokens.0.token: tokenfilter }
- match: { tokens.1.token: föo }
- match: { tokens.2.token: Bâr }
- match: { tokens.3.token: ruß }
- do:
indices.analyze:
index: test
body:
tokenizer: keyword
tokenizer: standard
filter: ["tokenfilter_icu_folding"]
text: icufolding Föo Bâr Ruß
- length: { tokens: 1 }
- match: { tokens.0.token: icufolding foo bâr russ }
- length: { tokens: 4 }
- match: { tokens.0.token: icufolding }
- match: { tokens.1.token: foo }
- match: { tokens.2.token: bâr }
- match: { tokens.3.token: russ }
---
"Normalization with a CamcelCase UnicodeSet Filter":
"Normalization with deprecated unicodeSetFilter":
- skip:
version: " - 6.99.99"
reason: unicodeSetFilter deprecated in 7.0.0, replaced by unicode_set_filter
@@ -121,6 +132,8 @@
type: icu_folding
unicodeSetFilter: "[^â]"
- do:
warnings:
- "[unicodeSetFilter] has been deprecated in favor of [unicode_set_filter]"
indices.analyze:
index: test
body:
@@ -132,27 +145,4 @@
- match: { tokens.1.token: föo }
- match: { tokens.2.token: bâr }
- match: { tokens.3.token: ruß }
- do:
indices.analyze:
index: test
body:
tokenizer: standard
filter: ["tokenfilter_icu_normalizer"]
text: tokenfilter Föo Bâr Ruß
- length: { tokens: 4 }
- match: { tokens.0.token: tokenfilter }
- match: { tokens.1.token: föo }
- match: { tokens.2.token: Bâr }
- match: { tokens.3.token: ruß }
- do:
indices.analyze:
index: test
body:
tokenizer: standard
filter: ["tokenfilter_icu_folding"]
text: icufolding Föo Bâr Ruß
- length: { tokens: 4 }
- match: { tokens.0.token: icufolding }
- match: { tokens.1.token: foo }
- match: { tokens.2.token: bâr }
- match: { tokens.3.token: russ }
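For context on the expected tokens: the indices.create body that configures charfilter_icu_normalizer and the two token filters is elided from this diff, so the exact unicode_set_filter patterns are not shown. The following self-contained ICU4J program (illustrative only; the class name and the "[^ß]" pattern are assumptions, not taken from the test settings) shows how a UnicodeSet filter restricts normalization:

```java
import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;

public class UnicodeSetFilterDemo {
    public static void main(String[] args) {
        // nfkc_cf lower-cases and case-folds, e.g. "Ruß" -> "russ".
        Normalizer2 base = Normalizer2.getNFKCCasefoldInstance();

        // A UnicodeSet filter limits which characters may be normalized;
        // "[^ß]" means "every character except ß", so ß passes through untouched.
        UnicodeSet set = new UnicodeSet("[^ß]");
        set.freeze();
        Normalizer2 filtered = new FilteredNormalizer2(base, set);

        String input = "charfilter Föo Bâr Ruß";
        System.out.println(base.normalize(input));     // charfilter föo bâr russ
        System.out.println(filtered.normalize(input)); // charfilter föo bâr ruß
    }
}
```

FilteredNormalizer2 only normalizes spans of characters inside the set and copies everything else verbatim, which is why a pattern such as "[^ß]" can exempt ß from case folding while the rest of the text is still normalized, as in the assertions above.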