From 26cbda41ea2dd1c5f24328860b5bfed50933b037 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 6 Dec 2016 10:01:04 +0100 Subject: [PATCH] ASCIIFoldingFilter's multi-term component should never preserve the original token. (#21982) This ports the fix for https://issues.apache.org/jira/browse/LUCENE-7536 to Elasticsearch's ASCIIFoldingTokenFilterFactory. --- .../analysis/ASCIIFoldingTokenFilterFactory.java | 16 +++++++++++++++- .../ASCIIFoldingTokenFilterFactoryTests.java | 7 +++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java index b7417b26374..4318ef273dc 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java @@ -47,6 +47,20 @@ public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory i @Override public Object getMultiTermComponent() { - return this; + if (preserveOriginal == false) { + return this; + } else { + // See https://issues.apache.org/jira/browse/LUCENE-7536 for the reasoning + return new TokenFilterFactory() { + @Override + public String name() { + return ASCIIFoldingTokenFilterFactory.this.name(); + } + @Override + public TokenStream create(TokenStream tokenStream) { + return new ASCIIFoldingFilter(tokenStream, false); + } + }; + } } } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactoryTests.java index d68cbaa9d30..973225df180 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactoryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactoryTests.java @@ -55,5 +55,12 @@ public class 
ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase { Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + + // but the multi-term aware component still emits a single token + tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter).getMultiTermComponent(); + tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + expected = new String[]{"Anspruche"}; + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); } }