Add `fingerprint` token filter and `fingerprint` analyzer

Adds a `fingerprint` token filter which uses Lucene's FingerprintFilter,
and a `fingerprint` analyzer that combines the Fingerprint filter with
lowercasing, stop word removal and asciifolding.

Closes #13325
This commit is contained in:
Zachary Tong 2016-04-20 16:10:56 -04:00
parent 1f5fd3094f
commit 80288ad60c
10 changed files with 333 additions and 1 deletions

View File

@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
@ -29,11 +30,14 @@ import org.elasticsearch.index.IndexSettings;
* Factory for ASCIIFoldingFilter.
*/
public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory {
public static ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
public static boolean DEFAULT_PRESERVE_ORIGINAL = false;
private final boolean preserveOriginal;
public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
preserveOriginal = settings.getAsBoolean("preserve_original", false);
preserveOriginal = settings.getAsBoolean(PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL);
}
@Override

View File

@ -258,6 +258,7 @@ public final class AnalysisRegistry implements Closeable {
tokenFilters.put("apostrophe", ApostropheFilterFactory::new);
tokenFilters.put("classic", ClassicFilterFactory::new);
tokenFilters.put("decimal_digit", DecimalDigitFilterFactory::new);
tokenFilters.put("fingerprint", FingerprintTokenFilterFactory::new);
}
private void registerBuiltInAnalyzer(Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider>> analyzers) {
@ -304,6 +305,7 @@ public final class AnalysisRegistry implements Closeable {
analyzers.put("swedish", SwedishAnalyzerProvider::new);
analyzers.put("turkish", TurkishAnalyzerProvider::new);
analyzers.put("thai", ThaiAnalyzerProvider::new);
analyzers.put("fingerprint", FingerprintAnalyzerProvider::new);
}
private <T> Map<String, T> buildMapping(boolean analyzer, String toBuild, IndexSettings settings, Map<String, Settings> settingsMap, Map<String, AnalysisModule.AnalysisProvider<T>> providerMap, Map<String, AnalysisModule.AnalysisProvider<T>> defaultInstance) throws IOException {

View File

@ -0,0 +1,56 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
/** OpenRefine Fingerprinting, which uses a Standard tokenizer and lowercase + stop + fingerprint + asciifolding filters */
public final class FingerprintAnalyzer extends Analyzer {
    private final char separator;
    private final int maxOutputSize;
    private final boolean preserveOriginal;
    private final CharArraySet stopWords;

    /**
     * @param stopWords        stop words removed before fingerprinting
     * @param separator        character inserted between the sorted, deduplicated tokens
     * @param maxOutputSize    maximum size of the emitted fingerprint token
     * @param preserveOriginal whether ascii-folding also keeps the unfolded token
     */
    public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize, boolean preserveOriginal) {
        this.stopWords = stopWords;
        this.separator = separator;
        this.maxOutputSize = maxOutputSize;
        this.preserveOriginal = preserveOriginal;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final Tokenizer source = new StandardTokenizer();
        // Filter chain mirrors OpenRefine's fingerprint algorithm:
        // lowercase -> stop -> fingerprint -> asciifolding.
        TokenStream sink = new LowerCaseFilter(source);
        sink = new StopFilter(sink, stopWords);
        sink = new FingerprintFilter(sink, maxOutputSize, separator);
        sink = new ASCIIFoldingFilter(sink, preserveOriginal);
        return new TokenStreamComponents(source, sink);
    }
}

View File

@ -0,0 +1,60 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
/**
* Builds an OpenRefine Fingerprint analyzer. Uses the default settings from the various components
* (Standard Tokenizer and lowercase + stop + fingerprint + ascii-folding filters)
*/
public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {

    // Constants must be final: these were mutable public statics, which any code could reassign.
    public static final ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE;
    public static final ParseField PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.PRESERVE_ORIGINAL;

    public static final int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE;
    public static final boolean DEFAULT_PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.DEFAULT_PRESERVE_ORIGINAL;
    public static final CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET;

    private final FingerprintAnalyzer analyzer;

    /**
     * Reads the analyzer configuration (separator, max_output_size, preserve_original,
     * stopwords/stopwords_path) from the index settings and builds the analyzer once;
     * {@link #get()} always returns the same instance.
     */
    public FingerprintAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);

        char separator = FingerprintTokenFilterFactory.parseSeparator(settings);
        int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(), DEFAULT_MAX_OUTPUT_SIZE);
        boolean preserveOriginal = settings.getAsBoolean(PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL);
        CharArraySet stopWords = Analysis.parseStopWords(env, settings, DEFAULT_STOP_WORDS);

        this.analyzer = new FingerprintAnalyzer(stopWords, separator, maxOutputSize, preserveOriginal);
    }

    @Override
    public FingerprintAnalyzer get() {
        return analyzer;
    }
}

View File

@ -0,0 +1,69 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
/**
*
*/
/**
 * Factory for the {@code fingerprint} token filter: wraps Lucene's
 * {@link FingerprintFilter}, which sorts, deduplicates and concatenates all
 * tokens of a stream into a single output token.
 */
public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {

    // Constants must be final: the ParseFields were mutable public statics.
    public static final ParseField SEPARATOR = new ParseField("separator");
    public static final ParseField MAX_OUTPUT_SIZE = new ParseField("max_output_size");

    public static final char DEFAULT_SEPARATOR = ' ';
    public static final int DEFAULT_MAX_OUTPUT_SIZE = 255;

    private final char separator;
    private final int maxOutputSize;

    public FingerprintTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.separator = parseSeparator(settings);
        this.maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(), DEFAULT_MAX_OUTPUT_SIZE);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new FingerprintFilter(tokenStream, maxOutputSize, separator);
    }

    /**
     * Parses the {@code separator} setting.
     *
     * @return the configured single-character separator, or {@link #DEFAULT_SEPARATOR}
     *         when the setting is absent
     * @throws IllegalArgumentException if the configured value is not exactly one character
     */
    public static char parseSeparator(Settings settings) throws IllegalArgumentException {
        String customSeparator = settings.get(SEPARATOR.getPreferredName());
        if (customSeparator == null) {
            return DEFAULT_SEPARATOR;
        }
        if (customSeparator.length() == 1) {
            return customSeparator.charAt(0);
        }
        throw new IllegalArgumentException("Setting [separator] must be a single, non-null character. ["
            + customSeparator + "] was provided.");
    }
}

View File

@ -0,0 +1,68 @@
package org.elasticsearch.index.analysis;
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.elasticsearch.test.ESTokenStreamTestCase;
public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {

    public void testFingerprint() throws Exception {
        // Tokens are lowercased, deduplicated, sorted and joined with the separator.
        Analyzer analyzer = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
        assertAnalyzesTo(analyzer, "foo bar@baz Baz $ foo foo FOO. FoO",
            new String[]{"bar baz foo"});
    }

    public void testReusableTokenStream() throws Exception {
        // The same analyzer instance must produce correct results across analyses.
        Analyzer analyzer = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
        assertAnalyzesTo(analyzer, "foo bar baz Baz foo foo FOO. FoO",
            new String[]{"bar baz foo"});
        assertAnalyzesTo(analyzer, "xyz XYZ abc 123.2 abc",
            new String[]{"123.2 abc xyz"});
    }

    public void testAsciifolding() throws Exception {
        // Extended characters are folded to their ASCII equivalents.
        Analyzer analyzer = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
        assertAnalyzesTo(analyzer, "gödel escher bach",
            new String[]{"bach escher godel"});
    }

    public void testPreserveOriginal() throws Exception {
        // With preserve_original, both folded and unfolded fingerprints are emitted.
        Analyzer analyzer = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true);
        assertAnalyzesTo(analyzer, "gödel escher bach",
            new String[]{"bach escher godel", "bach escher gödel"});
    }

    public void testLimit() throws Exception {
        // Fingerprints longer than max_output_size are dropped entirely, not truncated.
        Analyzer analyzer = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 3, false);
        assertAnalyzesTo(analyzer, "e d c b a",
            new String[]{});
        assertAnalyzesTo(analyzer, "b a",
            new String[]{"a b"});
    }

    public void testSeparator() throws Exception {
        // A custom separator character joins the sorted tokens.
        Analyzer analyzer = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, '_', 255, true);
        assertAnalyzesTo(analyzer, "b c a",
            new String[]{"a_b_c"});
    }
}

View File

@ -79,5 +79,7 @@ include::analyzers/lang-analyzer.asciidoc[]
include::analyzers/snowball-analyzer.asciidoc[]
include::analyzers/fingerprint-analyzer.asciidoc[]
include::analyzers/custom-analyzer.asciidoc[]

View File

@ -0,0 +1,41 @@
[[analysis-fingerprint-analyzer]]
=== Fingerprint Analyzer
The `fingerprint` analyzer implements a
https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth#fingerprint[fingerprinting algorithm]
which is used by the OpenRefine project to assist in clustering.
The `fingerprint` analyzer is composed of a <<analysis-standard-tokenizer>>, and four
token filters (in this order): <<analysis-lowercase-tokenfilter>>, <<analysis-stop-tokenfilter>>,
<<analysis-fingerprint-tokenfilter>> and <<analysis-asciifolding-tokenfilter>>.
Input text is lowercased, normalized to remove extended characters, sorted, deduplicated and
concatenated into a single token. If a stopword list is configured, stop words will
also be removed. For example, the sentence:
____
"Yes yes, Gödel said this sentence is consistent and."
____
will be transformed into the token: `"and consistent godel is said sentence this yes"`
Notice how the words are all lowercased, the umlaut in "gödel" has been normalized to "godel",
punctuation has been removed, and "yes" has been de-duplicated.
The `fingerprint` analyzer has these configurable settings:
[cols="<,<",options="header",]
|=======================================================================
|Setting |Description
|`separator` | The character that separates the tokens after concatenation.
Defaults to a space.
|`max_output_size` | The maximum token size to emit. Defaults to `255`. See <<analysis-fingerprint-tokenfilter-max-size>>
|`preserve_original`| If `true`, emits both the original and folded version of
tokens that contain extended characters. Defaults to `false`.
|`stopwords` | A list of stop words to use. Defaults to an empty list (`_none_`).
|`stopwords_path` | A path (either relative to `config` location, or absolute) to a stopwords
file configuration. Each stop word should be in its own "line" (separated
by a line break). The file must be UTF-8 encoded.
|=======================================================================

View File

@ -86,3 +86,5 @@ include::tokenfilters/classic-tokenfilter.asciidoc[]
include::tokenfilters/apostrophe-tokenfilter.asciidoc[]
include::tokenfilters/decimal-digit-tokenfilter.asciidoc[]
include::tokenfilters/fingerprint-tokenfilter.asciidoc[]

View File

@ -0,0 +1,28 @@
[[analysis-fingerprint-tokenfilter]]
=== Fingerprint Token Filter
The `fingerprint` token filter emits a single token which is useful for fingerprinting
a body of text, and/or providing a token that can be clustered on. It does this by
sorting the tokens, deduplicating and then concatenating them back into a single token.
For example, the tokens `["the", "quick", "quick", "brown", "fox", "was", "very", "brown"]` will be
transformed into a single token: `"brown fox quick the very was"`. Notice how the tokens were sorted
alphabetically, and there is only one `"quick"`.
The following are settings that can be set for a `fingerprint` token
filter type:
[cols="<,<",options="header",]
|======================================================
|Setting |Description
|`separator` |Defaults to a space.
|`max_output_size` |Defaults to `255`.
|======================================================
[[analysis-fingerprint-tokenfilter-max-size]]
==== Maximum token size
Because a field may have many unique tokens, it is important to set a cutoff so that fields do not grow
too large. The `max_output_size` setting controls this behavior. If the concatenated fingerprint
grows larger than `max_output_size`, the token filter will exit and will not emit a token (i.e. the
field will be empty).