Move AsciiFolding earlier in FingerprintAnalyzer filter chain
Rearranges the FingerprintAnalyzer so that AsciiFolding comes earlier in the chain (after lowercasing and before stop removal, for maximum deduping power). Closes #18266
This commit is contained in:
parent
9ce96f5792
commit
5ee5cc25cc
|
@ -48,9 +48,9 @@ public final class FingerprintAnalyzer extends Analyzer {
|
|||
final Tokenizer tokenizer = new StandardTokenizer();
|
||||
TokenStream stream = tokenizer;
|
||||
stream = new LowerCaseFilter(stream);
|
||||
stream = new ASCIIFoldingFilter(stream, preserveOriginal);
|
||||
stream = new StopFilter(stream, stopWords);
|
||||
stream = new FingerprintFilter(stream, maxOutputSize, separator);
|
||||
stream = new ASCIIFoldingFilter(stream, preserveOriginal);
|
||||
return new TokenStreamComponents(tokenizer, stream);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -43,12 +43,15 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
|
|||
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
|
||||
assertAnalyzesTo(a, "gödel escher bach",
|
||||
new String[]{"bach escher godel"});
|
||||
|
||||
assertAnalyzesTo(a, "gödel godel escher bach",
|
||||
new String[]{"bach escher godel"});
|
||||
}
|
||||
|
||||
public void testPreserveOriginal() throws Exception {
|
||||
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true);
|
||||
assertAnalyzesTo(a, "gödel escher bach",
|
||||
new String[]{"bach escher godel", "bach escher gödel"});
|
||||
new String[]{"bach escher godel gödel"});
|
||||
}
|
||||
|
||||
public void testLimit() throws Exception {
|
||||
|
@ -65,4 +68,5 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
|
|||
assertAnalyzesTo(a, "b c a",
|
||||
new String[]{"a_b_c"});
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -17,11 +17,11 @@ It consists of:
|
|||
Tokenizer::
|
||||
* <<analysis-standard-tokenizer,Standard Tokenizer>>
|
||||
|
||||
Token Filters::
|
||||
* <<analysis-lowercase-tokenfilter,Lower Case Token Filter>>
|
||||
* <<analysis-stop-tokenfilter,Stop Token Filter>> (disabled by default)
|
||||
* <<analysis-fingerprint-tokenfilter>>
|
||||
* <<analysis-asciifolding-tokenfilter>>
|
||||
Token Filters (in order)::
|
||||
1. <<analysis-lowercase-tokenfilter,Lower Case Token Filter>>
|
||||
2. <<analysis-asciifolding-tokenfilter>>
|
||||
3. <<analysis-stop-tokenfilter,Stop Token Filter>> (disabled by default)
|
||||
4. <<analysis-fingerprint-tokenfilter>>
|
||||
|
||||
[float]
|
||||
=== Example output
|
||||
|
@ -68,7 +68,6 @@ The `fingerprint` analyzer accepts the following parameters:
|
|||
|
||||
A pre-defined stop words list like `_english_` or an array containing a
|
||||
list of stop words. Defaults to `_none_`.
|
||||
|
||||
`stopwords_path`::
|
||||
|
||||
The path to a file containing stop words.
|
||||
|
|
Loading…
Reference in New Issue