From 5ee5cc25ccaf4c11deb5a8e365df6c6071f126d2 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Thu, 12 May 2016 09:34:15 -0400 Subject: [PATCH] Move AsciiFolding earlier in FingerprintAnalyzer filter chain Rearranges the FingerprintAnalyzer so that AsciiFolding comes earlier in the chain (after lowercasing, before stop removal, for maximum deduping power) Closes #18266 --- .../index/analysis/FingerprintAnalyzer.java | 2 +- .../index/analysis/FingerprintAnalyzerTests.java | 6 +++++- .../analysis/analyzers/fingerprint-analyzer.asciidoc | 11 +++++------ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzer.java b/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzer.java index 66fdbeaeb39..f7bf44256cc 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzer.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzer.java @@ -48,9 +48,9 @@ public final class FingerprintAnalyzer extends Analyzer { final Tokenizer tokenizer = new StandardTokenizer(); TokenStream stream = tokenizer; stream = new LowerCaseFilter(stream); + stream = new ASCIIFoldingFilter(stream, preserveOriginal); stream = new StopFilter(stream, stopWords); stream = new FingerprintFilter(stream, maxOutputSize, separator); - stream = new ASCIIFoldingFilter(stream, preserveOriginal); return new TokenStreamComponents(tokenizer, stream); } } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/FingerprintAnalyzerTests.java b/core/src/test/java/org/elasticsearch/index/analysis/FingerprintAnalyzerTests.java index 0e4ed8f4fb7..8c1d530e448 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/FingerprintAnalyzerTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/FingerprintAnalyzerTests.java @@ -43,12 +43,15 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase { Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false); assertAnalyzesTo(a, "gödel escher bach", new String[]{"bach escher godel"}); + + assertAnalyzesTo(a, "gödel godel escher bach", + new String[]{"bach escher godel"}); } public void testPreserveOriginal() throws Exception { Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true); assertAnalyzesTo(a, "gödel escher bach", - new String[]{"bach escher godel", "bach escher gödel"}); + new String[]{"bach escher godel gödel"}); } public void testLimit() throws Exception { @@ -65,4 +68,5 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase { assertAnalyzesTo(a, "b c a", new String[]{"a_b_c"}); } + } diff --git a/docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc b/docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc index a66495acdbe..b393c883441 100644 --- a/docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc +++ b/docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc @@ -17,11 +17,11 @@ It consists of: Tokenizer:: * <> -Token Filters:: -* <> -* <> (disabled by default) -* <> -* <> +Token Filters (in order):: +1. <> +2. <> +3. <> (disabled by default) +4. <> [float] === Example output @@ -68,7 +68,6 @@ The `fingerprint` analyzer accepts the following parameters: A pre-defined stop words list like `_english_` or an array containing a list of stop words. Defaults to `_none_`. - `stopwords_path`:: The path to a file containing stop words.