AsciiFoldingFilter's multi-term component should never preserve the original token. (#21982)

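This ports the fix for https://issues.apache.org/jira/browse/LUCENE-7536 to
Elasticsearch's ASCIIFoldingTokenFilterFactory: when preserveOriginal is
enabled, getMultiTermComponent() now returns a factory whose filter emits only
the folded token instead of returning the factory itself.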
This commit is contained in:
Adrien Grand 2016-12-06 10:01:04 +01:00 committed by GitHub
parent c8f241f284
commit 26cbda41ea
2 changed files with 22 additions and 1 deletions

View File

@ -47,6 +47,20 @@ public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory i
@Override @Override
public Object getMultiTermComponent() { public Object getMultiTermComponent() {
return this; if (preserveOriginal == false) {
return this;
} else {
// See https://issues.apache.org/jira/browse/LUCENE-7536 for the reasoning
return new TokenFilterFactory() {
@Override
public String name() {
return ASCIIFoldingTokenFilterFactory.this.name();
}
@Override
public TokenStream create(TokenStream tokenStream) {
return new ASCIIFoldingFilter(tokenStream, false);
}
};
}
} }
} }

View File

@ -55,5 +55,12 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
Tokenizer tokenizer = new WhitespaceTokenizer(); Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source)); tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected); assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
// but the multi-term aware component still emits a single token
tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter).getMultiTermComponent();
tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
expected = new String[]{"Anspruche"};
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
} }
} }