From dc33a8323188416df4e85d0ab762a750a482611b Mon Sep 17 00:00:00 2001 From: Clinton Gormley Date: Thu, 19 May 2016 19:37:13 +0200 Subject: [PATCH] Remove the preserve_original option from the FingerprintAnalyzer (#18471) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The preserve_original option to the ASCIIFoldingFilter doesn't play well with the FingerprintFilter, as it ends up producing fingerprints like: "and consistent godel gödel is said sentence this yes" The goal of the OpenRefine algorithm is to produce a small normalized ASCII fingerprint. There's no need to expose preserve_original. --- .../index/analysis/FingerprintAnalyzer.java | 6 ++---- .../analysis/FingerprintAnalyzerProvider.java | 5 +---- .../analysis/FingerprintAnalyzerTests.java | 20 ++++--------------- 3 files changed, 7 insertions(+), 24 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzer.java b/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzer.java index f7bf44256cc..985a081ccc8 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzer.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzer.java @@ -33,13 +33,11 @@ import org.apache.lucene.analysis.util.CharArraySet; public final class FingerprintAnalyzer extends Analyzer { private final char separator; private final int maxOutputSize; - private final boolean preserveOriginal; private final CharArraySet stopWords; - public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize, boolean preserveOriginal) { + public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize) { this.separator = separator; this.maxOutputSize = maxOutputSize; - this.preserveOriginal = preserveOriginal; this.stopWords = stopWords; } @@ -48,7 +46,7 @@ public final class FingerprintAnalyzer extends Analyzer { final Tokenizer tokenizer = new StandardTokenizer(); 
TokenStream stream = tokenizer; stream = new LowerCaseFilter(stream); - stream = new ASCIIFoldingFilter(stream, preserveOriginal); + stream = new ASCIIFoldingFilter(stream, false); stream = new StopFilter(stream, stopWords); stream = new FingerprintFilter(stream, maxOutputSize, separator); return new TokenStreamComponents(tokenizer, stream); diff --git a/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzerProvider.java b/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzerProvider.java index 897068cbf8b..bb8a51e0969 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzerProvider.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzerProvider.java @@ -34,10 +34,8 @@ import org.elasticsearch.index.IndexSettings; public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider { public static ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE; - public static ParseField PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.PRESERVE_ORIGINAL; public static int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE; - public static boolean DEFAULT_PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.DEFAULT_PRESERVE_ORIGINAL; public static CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET; private final FingerprintAnalyzer analyzer; @@ -47,10 +45,9 @@ public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider