Move AsciiFolding earlier in FingerprintAnalyzer filter chain

Rearranges the FingerprintAnalyzer so that AsciiFolding comes earlier in the filter chain: after lowercasing and before stop-word removal, for maximum deduplication power.

Closes #18266
This commit is contained in:
Zachary Tong 2016-05-12 09:34:15 -04:00
parent 9ce96f5792
commit 5ee5cc25cc
3 changed files with 11 additions and 8 deletions

View File

@ -48,9 +48,9 @@ public final class FingerprintAnalyzer extends Analyzer {
final Tokenizer tokenizer = new StandardTokenizer(); final Tokenizer tokenizer = new StandardTokenizer();
TokenStream stream = tokenizer; TokenStream stream = tokenizer;
stream = new LowerCaseFilter(stream); stream = new LowerCaseFilter(stream);
stream = new ASCIIFoldingFilter(stream, preserveOriginal);
stream = new StopFilter(stream, stopWords); stream = new StopFilter(stream, stopWords);
stream = new FingerprintFilter(stream, maxOutputSize, separator); stream = new FingerprintFilter(stream, maxOutputSize, separator);
stream = new ASCIIFoldingFilter(stream, preserveOriginal);
return new TokenStreamComponents(tokenizer, stream); return new TokenStreamComponents(tokenizer, stream);
} }
} }

View File

@ -43,12 +43,15 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false); Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
assertAnalyzesTo(a, "gödel escher bach", assertAnalyzesTo(a, "gödel escher bach",
new String[]{"bach escher godel"}); new String[]{"bach escher godel"});
assertAnalyzesTo(a, "gödel godel escher bach",
new String[]{"bach escher godel"});
} }
public void testPreserveOriginal() throws Exception { public void testPreserveOriginal() throws Exception {
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true); Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true);
assertAnalyzesTo(a, "gödel escher bach", assertAnalyzesTo(a, "gödel escher bach",
new String[]{"bach escher godel", "bach escher gödel"}); new String[]{"bach escher godel gödel"});
} }
public void testLimit() throws Exception { public void testLimit() throws Exception {
@ -65,4 +68,5 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
assertAnalyzesTo(a, "b c a", assertAnalyzesTo(a, "b c a",
new String[]{"a_b_c"}); new String[]{"a_b_c"});
} }
} }

View File

@ -17,11 +17,11 @@ It consists of:
Tokenizer:: Tokenizer::
* <<analysis-standard-tokenizer,Standard Tokenizer>> * <<analysis-standard-tokenizer,Standard Tokenizer>>
Token Filters:: Token Filters (in order)::
* <<analysis-lowercase-tokenfilter,Lower Case Token Filter>> 1. <<analysis-lowercase-tokenfilter,Lower Case Token Filter>>
* <<analysis-stop-tokenfilter,Stop Token Filter>> (disabled by default) 2. <<analysis-asciifolding-tokenfilter>>
* <<analysis-fingerprint-tokenfilter>> 3. <<analysis-stop-tokenfilter,Stop Token Filter>> (disabled by default)
* <<analysis-asciifolding-tokenfilter>> 4. <<analysis-fingerprint-tokenfilter>>
[float] [float]
=== Example output === Example output
@ -68,7 +68,6 @@ The `fingerprint` analyzer accepts the following parameters:
A pre-defined stop words list like `_english_` or an array containing a A pre-defined stop words list like `_english_` or an array containing a
list of stop words. Defaults to `_none_`. list of stop words. Defaults to `_none_`.
`stopwords_path`:: `stopwords_path`::
The path to a file containing stop words. The path to a file containing stop words.