Add `fingerprint` token filter and `fingerprint` analyzer
Adds a `fingerprint` token filter which uses Lucene's FingerprintFilter, and a `fingerprint` analyzer that combines the fingerprint filter with lowercasing, stop-word removal and ASCII folding. Closes #13325
This commit is contained in: parent 1f5fd3094f, commit 80288ad60c
@@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

@@ -29,11 +30,14 @@ import org.elasticsearch.index.IndexSettings;
 * Factory for ASCIIFoldingFilter.
 */
public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory {
+    public static ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
+    public static boolean DEFAULT_PRESERVE_ORIGINAL = false;
+
    private final boolean preserveOriginal;

    public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
-        preserveOriginal = settings.getAsBoolean("preserve_original", false);
+        preserveOriginal = settings.getAsBoolean(PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL);
    }

    @Override
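For context on the `preserve_original` setting this change formalizes, here is a minimal hand-written sketch (not part of the commit; it assumes Lucene's analysis classes on the classpath). ASCIIFoldingFilter emits the folded form and, when asked, also keeps the original token at the same position:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PreserveOriginalDemo {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("gödel"));
        // true = preserveOriginal: keep the unfolded token alongside the folded one
        TokenStream stream = new ASCIIFoldingFilter(tokenizer, true);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term); // prints "godel", then "gödel"
        }
        stream.end();
        stream.close();
    }
}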
@@ -258,6 +258,7 @@ public final class AnalysisRegistry implements Closeable {
        tokenFilters.put("apostrophe", ApostropheFilterFactory::new);
        tokenFilters.put("classic", ClassicFilterFactory::new);
        tokenFilters.put("decimal_digit", DecimalDigitFilterFactory::new);
+        tokenFilters.put("fingerprint", FingerprintTokenFilterFactory::new);
    }

    private void registerBuiltInAnalyzer(Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider>> analyzers) {

@@ -304,6 +305,7 @@ public final class AnalysisRegistry implements Closeable {
        analyzers.put("swedish", SwedishAnalyzerProvider::new);
        analyzers.put("turkish", TurkishAnalyzerProvider::new);
        analyzers.put("thai", ThaiAnalyzerProvider::new);
+        analyzers.put("fingerprint", FingerprintAnalyzerProvider::new);
    }

    private <T> Map<String, T> buildMapping(boolean analyzer, String toBuild, IndexSettings settings, Map<String, Settings> settingsMap, Map<String, AnalysisModule.AnalysisProvider<T>> providerMap, Map<String, AnalysisModule.AnalysisProvider<T>> defaultInstance) throws IOException {
@@ -0,0 +1,56 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;

/** OpenRefine Fingerprinting, which uses a Standard tokenizer and lowercase + stop + fingerprint + asciifolding filters */
public final class FingerprintAnalyzer extends Analyzer {
    private final char separator;
    private final int maxOutputSize;
    private final boolean preserveOriginal;
    private final CharArraySet stopWords;

    public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize, boolean preserveOriginal) {
        this.separator = separator;
        this.maxOutputSize = maxOutputSize;
        this.preserveOriginal = preserveOriginal;
        this.stopWords = stopWords;
    }

    @Override
    protected TokenStreamComponents createComponents(String s) {
        final Tokenizer tokenizer = new StandardTokenizer();
        TokenStream stream = tokenizer;
        stream = new LowerCaseFilter(stream);
        stream = new StopFilter(stream, stopWords);
        stream = new FingerprintFilter(stream, maxOutputSize, separator);
        stream = new ASCIIFoldingFilter(stream, preserveOriginal);
        return new TokenStreamComponents(tokenizer, stream);
    }
}
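To make the chain above concrete, a small usage sketch (illustrative only, not part of the commit; it assumes the class above and Lucene on the classpath). The whole input collapses into one sorted, deduplicated, lowercased, ASCII-folded token:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

public class FingerprintAnalyzerDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
        try (TokenStream stream = analyzer.tokenStream("field", "Yes yes, Gödel said this sentence is consistent and.")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // prints: and consistent godel is said sentence this yes
                System.out.println(term);
            }
            stream.end();
        }
    }
}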
@@ -0,0 +1,60 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

/**
 * Builds an OpenRefine Fingerprint analyzer. Uses the default settings from the various components
 * (Standard Tokenizer and lowercase + stop + fingerprint + ascii-folding filters)
 */
public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {

    public static ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE;
    public static ParseField PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.PRESERVE_ORIGINAL;

    public static int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE;
    public static boolean DEFAULT_PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.DEFAULT_PRESERVE_ORIGINAL;
    public static CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET;

    private final FingerprintAnalyzer analyzer;

    public FingerprintAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);

        char separator = FingerprintTokenFilterFactory.parseSeparator(settings);
        int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(), DEFAULT_MAX_OUTPUT_SIZE);
        boolean preserveOriginal = settings.getAsBoolean(PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL);
        CharArraySet stopWords = Analysis.parseStopWords(env, settings, DEFAULT_STOP_WORDS);

        this.analyzer = new FingerprintAnalyzer(stopWords, separator, maxOutputSize, preserveOriginal);
    }

    @Override
    public FingerprintAnalyzer get() {
        return analyzer;
    }
}
@@ -0,0 +1,69 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

/**
 * Factory for the fingerprint token filter, which wraps the stream in Lucene's FingerprintFilter.
 */
public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {

    private final char separator;
    private final int maxOutputSize;

    public static ParseField SEPARATOR = new ParseField("separator");
    public static ParseField MAX_OUTPUT_SIZE = new ParseField("max_output_size");

    public static final char DEFAULT_SEPARATOR = ' ';
    public static final int DEFAULT_MAX_OUTPUT_SIZE = 255;

    public FingerprintTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.separator = parseSeparator(settings);
        this.maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(),
            FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        TokenStream result = tokenStream;
        result = new FingerprintFilter(result, maxOutputSize, separator);
        return result;
    }

    public static char parseSeparator(Settings settings) throws IllegalArgumentException {
        String customSeparator = settings.get(SEPARATOR.getPreferredName());
        if (customSeparator == null) {
            return FingerprintTokenFilterFactory.DEFAULT_SEPARATOR;
        } else if (customSeparator.length() == 1) {
            return customSeparator.charAt(0);
        }

        throw new IllegalArgumentException("Setting [separator] must be a single, non-null character. ["
            + customSeparator + "] was provided.");
    }
}
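A short sketch of the separator parsing rules (illustrative only, not part of the commit; Settings.builder() and Settings.EMPTY are assumed to be the standard Elasticsearch settings APIs of this era):

import org.elasticsearch.common.settings.Settings;

public class SeparatorDemo {
    public static void main(String[] args) {
        // Unset: falls back to the default separator, a single space.
        System.out.println(FingerprintTokenFilterFactory.parseSeparator(Settings.EMPTY)); // ' '
        // A single character is accepted as-is.
        Settings custom = Settings.builder().put("separator", "_").build();
        System.out.println(FingerprintTokenFilterFactory.parseSeparator(custom));         // '_'
        // Anything longer throws IllegalArgumentException.
        FingerprintTokenFilterFactory.parseSeparator(Settings.builder().put("separator", "--").build());
    }
}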
@@ -0,0 +1,68 @@
package org.elasticsearch.index.analysis;

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.elasticsearch.test.ESTokenStreamTestCase;

public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {

    public void testFingerprint() throws Exception {
        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
        assertAnalyzesTo(a, "foo bar@baz Baz $ foo foo FOO. FoO",
            new String[]{"bar baz foo"});
    }

    public void testReusableTokenStream() throws Exception {
        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
        assertAnalyzesTo(a, "foo bar baz Baz foo foo FOO. FoO",
            new String[]{"bar baz foo"});
        assertAnalyzesTo(a, "xyz XYZ abc 123.2 abc",
            new String[]{"123.2 abc xyz"});
    }

    public void testAsciifolding() throws Exception {
        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
        assertAnalyzesTo(a, "gödel escher bach",
            new String[]{"bach escher godel"});
    }

    public void testPreserveOriginal() throws Exception {
        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true);
        assertAnalyzesTo(a, "gödel escher bach",
            new String[]{"bach escher godel", "bach escher gödel"});
    }

    public void testLimit() throws Exception {
        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 3, false);
        assertAnalyzesTo(a, "e d c b a",
            new String[]{});

        assertAnalyzesTo(a, "b a",
            new String[]{"a b"});
    }

    public void testSeparator() throws Exception {
        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, '_', 255, true);
        assertAnalyzesTo(a, "b c a",
            new String[]{"a_b_c"});
    }
}
@@ -79,5 +79,7 @@ include::analyzers/lang-analyzer.asciidoc[]

include::analyzers/snowball-analyzer.asciidoc[]

+include::analyzers/fingerprint-analyzer.asciidoc[]
+
include::analyzers/custom-analyzer.asciidoc[]

@@ -0,0 +1,41 @@
[[analysis-fingerprint-analyzer]]
=== Fingerprint Analyzer

The `fingerprint` analyzer implements a
https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth#fingerprint[fingerprinting algorithm]
which is used by the OpenRefine project to assist in clustering.

The `fingerprint` analyzer is composed of a <<analysis-standard-tokenizer>> and four
token filters (in this order): <<analysis-lowercase-tokenfilter>>, <<analysis-stop-tokenfilter>>,
<<analysis-fingerprint-tokenfilter>> and <<analysis-asciifolding-tokenfilter>>.

Input text is lowercased, normalized to remove extended characters, sorted, deduplicated and
concatenated into a single token. If a stopword list is configured, stop words will
also be removed. For example, the sentence:

____
"Yes yes, Gödel said this sentence is consistent and."
____

will be transformed into the token: `"and consistent godel is said sentence this yes"`

Notice how the words are all lowercased, the umlaut in "gödel" has been normalized to "godel",
punctuation has been removed, and "yes" has been deduplicated.

The `fingerprint` analyzer has these configurable settings:

[cols="<,<",options="header",]
|=======================================================================
|Setting |Description
|`separator` |The character that separates the tokens after concatenation.
Defaults to a space.
|`max_output_size` |The maximum token size to emit. Defaults to `255`. See
<<analysis-fingerprint-tokenfilter-max-size>>.
|`preserve_original` |If `true`, emits both the original and folded version of
tokens that contain extended characters. Defaults to `false`.
|`stopwords` |A list of stop words to use. Defaults to an empty list (`_none_`).
|`stopwords_path` |A path (either relative to the `config` location, or absolute) to a stopwords
file. Each stop word should be on its own line (separated by a line break).
The file must be UTF-8 encoded.
|=======================================================================
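Illustrative only (not from the original docs page): the `stopwords` setting maps onto the stop-word set passed to the `FingerprintAnalyzer` constructor shown earlier, so its effect can be sketched by constructing the analyzer directly:

import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

public class StopwordsDemo {
    public static void main(String[] args) throws Exception {
        // stopwords = ["and", "is", "this"]; other settings left at their defaults
        CharArraySet stops = new CharArraySet(Arrays.asList("and", "is", "this"), true);
        Analyzer a = new FingerprintAnalyzer(stops, ' ', 255, false);
        try (TokenStream ts = a.tokenStream("f", "Yes yes, Gödel said this sentence is consistent and.")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // the stop words drop out of the fingerprint:
                System.out.println(term); // consistent godel said sentence yes
            }
            ts.end();
        }
    }
}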
@@ -86,3 +86,5 @@ include::tokenfilters/classic-tokenfilter.asciidoc[]
include::tokenfilters/apostrophe-tokenfilter.asciidoc[]

include::tokenfilters/decimal-digit-tokenfilter.asciidoc[]
+
+include::tokenfilters/fingerprint-tokenfilter.asciidoc[]
@@ -0,0 +1,28 @@
[[analysis-fingerprint-tokenfilter]]
=== Fingerprint Token Filter

The `fingerprint` token filter emits a single token which is useful for fingerprinting
a body of text, and/or providing a token that can be clustered on. It does this by
sorting the tokens, deduplicating, and then concatenating them back into a single token.

For example, the tokens `["the", "quick", "quick", "brown", "fox", "was", "very", "brown"]` will be
transformed into a single token: `"brown fox quick the very was"`. Notice how the tokens were sorted
alphabetically, and there is only one `"quick"`.

The following settings can be set for a `fingerprint` token filter type:

[cols="<,<",options="header",]
|======================================================
|Setting |Description
|`separator` |The character that separates the tokens after concatenation. Defaults to a space.
|`max_output_size` |The maximum token size to emit. Defaults to `255`.
|======================================================

[[analysis-fingerprint-tokenfilter-max-size]]
==== Maximum token size

Because a field may have many unique tokens, it is important to set a cutoff so that fields do not grow
too large. The `max_output_size` setting controls this behavior. If the concatenated fingerprint
grows larger than `max_output_size`, the token filter will exit and will not emit a token (i.e. the
field will be empty).