AsciiFoldingFilter's multi-term component should never preserve the original token. (#21982)
This ports the fix for https://issues.apache.org/jira/browse/LUCENE-7536 to Elasticsearch's ASCIIFoldingTokenFilterFactory.
This commit is contained in:
parent
c8f241f284
commit
26cbda41ea
|
@ -47,6 +47,20 @@ public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory i
|
|||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
if (preserveOriginal == false) {
|
||||
return this;
|
||||
} else {
|
||||
// See https://issues.apache.org/jira/browse/LUCENE-7536 for the reasoning
|
||||
return new TokenFilterFactory() {
|
||||
@Override
|
||||
public String name() {
|
||||
return ASCIIFoldingTokenFilterFactory.this.name();
|
||||
}
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new ASCIIFoldingFilter(tokenStream, false);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -55,5 +55,12 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
|
|||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
||||
// but the multi-term aware component still emits a single token
|
||||
tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter).getMultiTermComponent();
|
||||
tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
expected = new String[]{"Anspruche"};
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue