Move AsciiFolding earlier in FingerprintAnalyzer filter chain
Rearranges the FingerprintAnalyzer so that AsciiFolding comes earlier in the chain (after lowercasing and before stop-word removal, for maximum deduping power). Closes #18266
This commit is contained in:
parent
9ce96f5792
commit
5ee5cc25cc
|
@ -48,9 +48,9 @@ public final class FingerprintAnalyzer extends Analyzer {
|
||||||
final Tokenizer tokenizer = new StandardTokenizer();
|
final Tokenizer tokenizer = new StandardTokenizer();
|
||||||
TokenStream stream = tokenizer;
|
TokenStream stream = tokenizer;
|
||||||
stream = new LowerCaseFilter(stream);
|
stream = new LowerCaseFilter(stream);
|
||||||
|
stream = new ASCIIFoldingFilter(stream, preserveOriginal);
|
||||||
stream = new StopFilter(stream, stopWords);
|
stream = new StopFilter(stream, stopWords);
|
||||||
stream = new FingerprintFilter(stream, maxOutputSize, separator);
|
stream = new FingerprintFilter(stream, maxOutputSize, separator);
|
||||||
stream = new ASCIIFoldingFilter(stream, preserveOriginal);
|
|
||||||
return new TokenStreamComponents(tokenizer, stream);
|
return new TokenStreamComponents(tokenizer, stream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -43,12 +43,15 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
|
||||||
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
|
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
|
||||||
assertAnalyzesTo(a, "gödel escher bach",
|
assertAnalyzesTo(a, "gödel escher bach",
|
||||||
new String[]{"bach escher godel"});
|
new String[]{"bach escher godel"});
|
||||||
|
|
||||||
|
assertAnalyzesTo(a, "gödel godel escher bach",
|
||||||
|
new String[]{"bach escher godel"});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testPreserveOriginal() throws Exception {
|
public void testPreserveOriginal() throws Exception {
|
||||||
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true);
|
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true);
|
||||||
assertAnalyzesTo(a, "gödel escher bach",
|
assertAnalyzesTo(a, "gödel escher bach",
|
||||||
new String[]{"bach escher godel", "bach escher gödel"});
|
new String[]{"bach escher godel gödel"});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testLimit() throws Exception {
|
public void testLimit() throws Exception {
|
||||||
|
@ -65,4 +68,5 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
|
||||||
assertAnalyzesTo(a, "b c a",
|
assertAnalyzesTo(a, "b c a",
|
||||||
new String[]{"a_b_c"});
|
new String[]{"a_b_c"});
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,11 +17,11 @@ It consists of:
|
||||||
Tokenizer::
|
Tokenizer::
|
||||||
* <<analysis-standard-tokenizer,Standard Tokenizer>>
|
* <<analysis-standard-tokenizer,Standard Tokenizer>>
|
||||||
|
|
||||||
Token Filters::
|
Token Filters (in order)::
|
||||||
* <<analysis-lowercase-tokenfilter,Lower Case Token Filter>>
|
1. <<analysis-lowercase-tokenfilter,Lower Case Token Filter>>
|
||||||
* <<analysis-stop-tokenfilter,Stop Token Filter>> (disabled by default)
|
2. <<analysis-asciifolding-tokenfilter>>
|
||||||
* <<analysis-fingerprint-tokenfilter>>
|
3. <<analysis-stop-tokenfilter,Stop Token Filter>> (disabled by default)
|
||||||
* <<analysis-asciifolding-tokenfilter>>
|
4. <<analysis-fingerprint-tokenfilter>>
|
||||||
|
|
||||||
[float]
|
[float]
|
||||||
=== Example output
|
=== Example output
|
||||||
|
@ -68,7 +68,6 @@ The `fingerprint` analyzer accepts the following parameters:
|
||||||
|
|
||||||
A pre-defined stop words list like `_english_` or an array containing a
|
A pre-defined stop words list like `_english_` or an array containing a
|
||||||
list of stop words. Defaults to `_none_`.
|
list of stop words. Defaults to `_none_`.
|
||||||
|
|
||||||
`stopwords_path`::
|
`stopwords_path`::
|
||||||
|
|
||||||
The path to a file containing stop words.
|
The path to a file containing stop words.
|
||||||
|
|
Loading…
Reference in New Issue