Remove the preserve_original option from the FingerprintAnalyzer (#18471)

The preserve_original option to the ASCIIFoldingFilter doesn't
play well with the FingerprintFilter, as it ends up producing
fingerprints like:

    "and consistent godel gödel is said sentence this yes"

The goal of the OpenRefine algorithm is to produce a small normalized
ASCII fingerprint. There's no need to expose preserve_original.
This commit is contained in:
Clinton Gormley 2016-05-19 19:37:13 +02:00
parent 8486488627
commit dc33a83231
3 changed files with 7 additions and 24 deletions

View File

@@ -33,13 +33,11 @@ import org.apache.lucene.analysis.util.CharArraySet;
public final class FingerprintAnalyzer extends Analyzer { public final class FingerprintAnalyzer extends Analyzer {
private final char separator; private final char separator;
private final int maxOutputSize; private final int maxOutputSize;
private final boolean preserveOriginal;
private final CharArraySet stopWords; private final CharArraySet stopWords;
public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize, boolean preserveOriginal) { public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize) {
this.separator = separator; this.separator = separator;
this.maxOutputSize = maxOutputSize; this.maxOutputSize = maxOutputSize;
this.preserveOriginal = preserveOriginal;
this.stopWords = stopWords; this.stopWords = stopWords;
} }
@@ -48,7 +46,7 @@ public final class FingerprintAnalyzer extends Analyzer {
final Tokenizer tokenizer = new StandardTokenizer(); final Tokenizer tokenizer = new StandardTokenizer();
TokenStream stream = tokenizer; TokenStream stream = tokenizer;
stream = new LowerCaseFilter(stream); stream = new LowerCaseFilter(stream);
stream = new ASCIIFoldingFilter(stream, preserveOriginal); stream = new ASCIIFoldingFilter(stream, false);
stream = new StopFilter(stream, stopWords); stream = new StopFilter(stream, stopWords);
stream = new FingerprintFilter(stream, maxOutputSize, separator); stream = new FingerprintFilter(stream, maxOutputSize, separator);
return new TokenStreamComponents(tokenizer, stream); return new TokenStreamComponents(tokenizer, stream);

View File

@@ -34,10 +34,8 @@ import org.elasticsearch.index.IndexSettings;
public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> { public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {
public static ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE; public static ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE;
public static ParseField PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.PRESERVE_ORIGINAL;
public static int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE; public static int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE;
public static boolean DEFAULT_PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.DEFAULT_PRESERVE_ORIGINAL;
public static CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET; public static CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET;
private final FingerprintAnalyzer analyzer; private final FingerprintAnalyzer analyzer;
@@ -47,10 +45,9 @@ public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<A
char separator = FingerprintTokenFilterFactory.parseSeparator(settings); char separator = FingerprintTokenFilterFactory.parseSeparator(settings);
int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(),DEFAULT_MAX_OUTPUT_SIZE); int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(),DEFAULT_MAX_OUTPUT_SIZE);
boolean preserveOriginal = settings.getAsBoolean(PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL);
CharArraySet stopWords = Analysis.parseStopWords(env, settings, DEFAULT_STOP_WORDS); CharArraySet stopWords = Analysis.parseStopWords(env, settings, DEFAULT_STOP_WORDS);
this.analyzer = new FingerprintAnalyzer(stopWords, separator, maxOutputSize, preserveOriginal); this.analyzer = new FingerprintAnalyzer(stopWords, separator, maxOutputSize);
} }
@Override @Override

View File

@@ -26,13 +26,13 @@ import org.elasticsearch.test.ESTokenStreamTestCase;
public class FingerprintAnalyzerTests extends ESTokenStreamTestCase { public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
public void testFingerprint() throws Exception { public void testFingerprint() throws Exception {
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false); Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
assertAnalyzesTo(a, "foo bar@baz Baz $ foo foo FOO. FoO", assertAnalyzesTo(a, "foo bar@baz Baz $ foo foo FOO. FoO",
new String[]{"bar baz foo"}); new String[]{"bar baz foo"});
} }
public void testReusableTokenStream() throws Exception { public void testReusableTokenStream() throws Exception {
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false); Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
assertAnalyzesTo(a, "foo bar baz Baz foo foo FOO. FoO", assertAnalyzesTo(a, "foo bar baz Baz foo foo FOO. FoO",
new String[]{"bar baz foo"}); new String[]{"bar baz foo"});
assertAnalyzesTo(a, "xyz XYZ abc 123.2 abc", assertAnalyzesTo(a, "xyz XYZ abc 123.2 abc",
@@ -40,7 +40,7 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
} }
public void testAsciifolding() throws Exception { public void testAsciifolding() throws Exception {
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false); Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
assertAnalyzesTo(a, "gödel escher bach", assertAnalyzesTo(a, "gödel escher bach",
new String[]{"bach escher godel"}); new String[]{"bach escher godel"});
@@ -48,14 +48,8 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
new String[]{"bach escher godel"}); new String[]{"bach escher godel"});
} }
public void testPreserveOriginal() throws Exception {
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true);
assertAnalyzesTo(a, "gödel escher bach",
new String[]{"bach escher godel gödel"});
}
public void testLimit() throws Exception { public void testLimit() throws Exception {
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 3, false); Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 3);
assertAnalyzesTo(a, "e d c b a", assertAnalyzesTo(a, "e d c b a",
new String[]{}); new String[]{});
@@ -63,10 +57,4 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
new String[]{"a b"}); new String[]{"a b"});
} }
public void testSeparator() throws Exception {
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, '_', 255, true);
assertAnalyzesTo(a, "b c a",
new String[]{"a_b_c"});
}
} }