Add `fingerprint` token filter and `fingerprint` analyzer
Adds a `fingerprint` token filter which uses Lucene's FingerprintFilter, and a `fingerprint` analyzer that combines the fingerprint filter with lowercasing, stop-word removal and ASCII folding. Closes #13325
This commit is contained in: parent 1f5fd3094f, commit 80288ad60c
@@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

@@ -29,11 +30,14 @@ import org.elasticsearch.index.IndexSettings;
 * Factory for ASCIIFoldingFilter.
 */
public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory {
+    public static ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
+    public static boolean DEFAULT_PRESERVE_ORIGINAL = false;
+
    private final boolean preserveOriginal;

    public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
-        preserveOriginal = settings.getAsBoolean("preserve_original", false);
+        preserveOriginal = settings.getAsBoolean(PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL);
    }

    @Override
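For context on the `preserve_original` setting this change formalizes, here is a minimal hand-written sketch (not part of the commit; it assumes Lucene's analysis classes on the classpath). ASCIIFoldingFilter emits the folded form and, when asked, also keeps the original token at the same position:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PreserveOriginalDemo {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("gödel"));
        // true = preserveOriginal: keep the unfolded token alongside the folded one
        TokenStream stream = new ASCIIFoldingFilter(tokenizer, true);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term); // prints "godel", then "gödel"
        }
        stream.end();
        stream.close();
    }
}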
@@ -258,6 +258,7 @@ public final class AnalysisRegistry implements Closeable {
        tokenFilters.put("apostrophe", ApostropheFilterFactory::new);
        tokenFilters.put("classic", ClassicFilterFactory::new);
        tokenFilters.put("decimal_digit", DecimalDigitFilterFactory::new);
+        tokenFilters.put("fingerprint", FingerprintTokenFilterFactory::new);
    }

    private void registerBuiltInAnalyzer(Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider>> analyzers) {

@@ -304,6 +305,7 @@ public final class AnalysisRegistry implements Closeable {
        analyzers.put("swedish", SwedishAnalyzerProvider::new);
        analyzers.put("turkish", TurkishAnalyzerProvider::new);
        analyzers.put("thai", ThaiAnalyzerProvider::new);
+        analyzers.put("fingerprint", FingerprintAnalyzerProvider::new);
    }

    private <T> Map<String, T> buildMapping(boolean analyzer, String toBuild, IndexSettings settings, Map<String, Settings> settingsMap, Map<String, AnalysisModule.AnalysisProvider<T>> providerMap, Map<String, AnalysisModule.AnalysisProvider<T>> defaultInstance) throws IOException {
@@ -0,0 +1,56 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;

/** OpenRefine Fingerprinting, which uses a Standard tokenizer and lowercase + stop + fingerprint + asciifolding filters */
public final class FingerprintAnalyzer extends Analyzer {
    private final char separator;
    private final int maxOutputSize;
    private final boolean preserveOriginal;
    private final CharArraySet stopWords;

    public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize, boolean preserveOriginal) {
        this.separator = separator;
        this.maxOutputSize = maxOutputSize;
        this.preserveOriginal = preserveOriginal;
        this.stopWords = stopWords;
    }

    @Override
    protected TokenStreamComponents createComponents(String s) {
        final Tokenizer tokenizer = new StandardTokenizer();
        TokenStream stream = tokenizer;
        stream = new LowerCaseFilter(stream);
        stream = new StopFilter(stream, stopWords);
        stream = new FingerprintFilter(stream, maxOutputSize, separator);
        stream = new ASCIIFoldingFilter(stream, preserveOriginal);
        return new TokenStreamComponents(tokenizer, stream);
    }
}
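To make the chain above concrete, a small usage sketch (illustrative only, not part of the commit; it assumes the class above and Lucene on the classpath). The whole input collapses into one sorted, deduplicated, lowercased, ASCII-folded token:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

public class FingerprintAnalyzerDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
        try (TokenStream stream = analyzer.tokenStream("field", "Yes yes, Gödel said this sentence is consistent and.")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // prints: and consistent godel is said sentence this yes
                System.out.println(term);
            }
            stream.end();
        }
    }
}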
@@ -0,0 +1,60 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

/**
 * Builds an OpenRefine Fingerprint analyzer. Uses the default settings from the various components
 * (Standard Tokenizer and lowercase + stop + fingerprint + ascii-folding filters)
 */
public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {

    public static ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE;
    public static ParseField PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.PRESERVE_ORIGINAL;

    public static int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE;
    public static boolean DEFAULT_PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.DEFAULT_PRESERVE_ORIGINAL;
    public static CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET;

    private final FingerprintAnalyzer analyzer;

    public FingerprintAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);

        char separator = FingerprintTokenFilterFactory.parseSeparator(settings);
        int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(), DEFAULT_MAX_OUTPUT_SIZE);
        boolean preserveOriginal = settings.getAsBoolean(PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL);
        CharArraySet stopWords = Analysis.parseStopWords(env, settings, DEFAULT_STOP_WORDS);

        this.analyzer = new FingerprintAnalyzer(stopWords, separator, maxOutputSize, preserveOriginal);
    }

    @Override
    public FingerprintAnalyzer get() {
        return analyzer;
    }
}
@@ -0,0 +1,69 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

/**
 * Factory for the fingerprint token filter, which wraps the stream in Lucene's FingerprintFilter.
 */
public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {

    private final char separator;
    private final int maxOutputSize;

    public static ParseField SEPARATOR = new ParseField("separator");
    public static ParseField MAX_OUTPUT_SIZE = new ParseField("max_output_size");

    public static final char DEFAULT_SEPARATOR = ' ';
    public static final int DEFAULT_MAX_OUTPUT_SIZE = 255;

    public FingerprintTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.separator = parseSeparator(settings);
        this.maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(),
            FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        TokenStream result = tokenStream;
        result = new FingerprintFilter(result, maxOutputSize, separator);
        return result;
    }

    public static char parseSeparator(Settings settings) throws IllegalArgumentException {
        String customSeparator = settings.get(SEPARATOR.getPreferredName());
        if (customSeparator == null) {
            return FingerprintTokenFilterFactory.DEFAULT_SEPARATOR;
        } else if (customSeparator.length() == 1) {
            return customSeparator.charAt(0);
        }

        throw new IllegalArgumentException("Setting [separator] must be a single, non-null character. ["
            + customSeparator + "] was provided.");
    }
}
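A short sketch of the separator parsing rules (illustrative only, not part of the commit; Settings.builder() and Settings.EMPTY are assumed to be the standard Elasticsearch settings APIs of this era):

import org.elasticsearch.common.settings.Settings;

public class SeparatorDemo {
    public static void main(String[] args) {
        // Unset: falls back to the default separator, a single space.
        System.out.println(FingerprintTokenFilterFactory.parseSeparator(Settings.EMPTY)); // ' '
        // A single character is accepted as-is.
        Settings custom = Settings.builder().put("separator", "_").build();
        System.out.println(FingerprintTokenFilterFactory.parseSeparator(custom));         // '_'
        // Anything longer throws IllegalArgumentException.
        FingerprintTokenFilterFactory.parseSeparator(Settings.builder().put("separator", "--").build());
    }
}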
@@ -0,0 +1,68 @@
package org.elasticsearch.index.analysis;

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.elasticsearch.test.ESTokenStreamTestCase;

public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {

    public void testFingerprint() throws Exception {
        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
        assertAnalyzesTo(a, "foo bar@baz Baz $ foo foo FOO. FoO",
            new String[]{"bar baz foo"});
    }

    public void testReusableTokenStream() throws Exception {
        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
        assertAnalyzesTo(a, "foo bar baz Baz foo foo FOO. FoO",
            new String[]{"bar baz foo"});
        assertAnalyzesTo(a, "xyz XYZ abc 123.2 abc",
            new String[]{"123.2 abc xyz"});
    }

    public void testAsciifolding() throws Exception {
        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
        assertAnalyzesTo(a, "gödel escher bach",
            new String[]{"bach escher godel"});
    }

    public void testPreserveOriginal() throws Exception {
        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true);
        assertAnalyzesTo(a, "gödel escher bach",
            new String[]{"bach escher godel", "bach escher gödel"});
    }

    public void testLimit() throws Exception {
        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 3, false);
        assertAnalyzesTo(a, "e d c b a",
            new String[]{});

        assertAnalyzesTo(a, "b a",
            new String[]{"a b"});
    }

    public void testSeparator() throws Exception {
        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, '_', 255, true);
        assertAnalyzesTo(a, "b c a",
            new String[]{"a_b_c"});
    }
}
@@ -79,5 +79,7 @@ include::analyzers/lang-analyzer.asciidoc[]

include::analyzers/snowball-analyzer.asciidoc[]

+include::analyzers/fingerprint-analyzer.asciidoc[]
+
include::analyzers/custom-analyzer.asciidoc[]

@@ -0,0 +1,41 @@
[[analysis-fingerprint-analyzer]]
=== Fingerprint Analyzer

The `fingerprint` analyzer implements a
https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth#fingerprint[fingerprinting algorithm]
which is used by the OpenRefine project to assist in clustering.

The `fingerprint` analyzer is composed of a <<analysis-standard-tokenizer>> and four
token filters (in this order): <<analysis-lowercase-tokenfilter>>, <<analysis-stop-tokenfilter>>,
<<analysis-fingerprint-tokenfilter>> and <<analysis-asciifolding-tokenfilter>>.

Input text is lowercased, normalized to remove extended characters, sorted, deduplicated and
concatenated into a single token. If a stopword list is configured, stop words will
also be removed. For example, the sentence:

____
"Yes yes, Gödel said this sentence is consistent and."
____

will be transformed into the token: `"and consistent godel is said sentence this yes"`

Notice how the words are all lowercased, the umlaut in "gödel" has been normalized to "godel",
punctuation has been removed, and "yes" has been deduplicated.

The `fingerprint` analyzer has these configurable settings:

[cols="<,<",options="header",]
|=======================================================================
|Setting |Description
|`separator` |The character that separates the tokens after concatenation.
Defaults to a space.
|`max_output_size` |The maximum token size to emit. Defaults to `255`. See
<<analysis-fingerprint-tokenfilter-max-size>>.
|`preserve_original` |If `true`, emits both the original and folded version of
tokens that contain extended characters. Defaults to `false`.
|`stopwords` |A list of stop words to use. Defaults to an empty list (`_none_`).
|`stopwords_path` |A path (either relative to the `config` location, or absolute) to a stopwords
file. Each stop word should be on its own line (separated by a line break).
The file must be UTF-8 encoded.
|=======================================================================
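Illustrative only (not from the original docs page): the `stopwords` setting maps onto the stop-word set passed to the `FingerprintAnalyzer` constructor shown earlier, so its effect can be sketched by constructing the analyzer directly:

import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

public class StopwordsDemo {
    public static void main(String[] args) throws Exception {
        // stopwords = ["and", "is", "this"]; other settings left at their defaults
        CharArraySet stops = new CharArraySet(Arrays.asList("and", "is", "this"), true);
        Analyzer a = new FingerprintAnalyzer(stops, ' ', 255, false);
        try (TokenStream ts = a.tokenStream("f", "Yes yes, Gödel said this sentence is consistent and.")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // the stop words drop out of the fingerprint:
                System.out.println(term); // consistent godel said sentence yes
            }
            ts.end();
        }
    }
}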
@@ -86,3 +86,5 @@ include::tokenfilters/classic-tokenfilter.asciidoc[]
include::tokenfilters/apostrophe-tokenfilter.asciidoc[]

include::tokenfilters/decimal-digit-tokenfilter.asciidoc[]
+
+include::tokenfilters/fingerprint-tokenfilter.asciidoc[]
@@ -0,0 +1,28 @@
[[analysis-fingerprint-tokenfilter]]
=== Fingerprint Token Filter

The `fingerprint` token filter emits a single token which is useful for fingerprinting
a body of text, and/or providing a token that can be clustered on. It does this by
sorting the tokens, deduplicating, and then concatenating them back into a single token.

For example, the tokens `["the", "quick", "quick", "brown", "fox", "was", "very", "brown"]` will be
transformed into a single token: `"brown fox quick the very was"`. Notice how the tokens were sorted
alphabetically, and there is only one `"quick"`.

The following settings can be set for a `fingerprint` token filter type:

[cols="<,<",options="header",]
|======================================================
|Setting |Description
|`separator` |The character that separates the tokens after concatenation. Defaults to a space.
|`max_output_size` |The maximum token size to emit. Defaults to `255`.
|======================================================

[[analysis-fingerprint-tokenfilter-max-size]]
==== Maximum token size

Because a field may have many unique tokens, it is important to set a cutoff so that fields do not grow
too large. The `max_output_size` setting controls this behavior. If the concatenated fingerprint
grows larger than `max_output_size`, the token filter will exit and will not emit a token (i.e. the
field will be empty).