Remove the preserve_original option from the FingerprintAnalyzer (#18471)
The preserve_original option to the ASCIIFoldingFilter doesn't play well with the FingerprintFilter, as it ends up producing fingerprints like: "and consistent godel gödel is said sentence this yes" The goal of the OpenRefine algorithm is to produce a small normalized ASCII fingerprint. There's no need to expose preserve_original.
This commit is contained in:
parent
8486488627
commit
dc33a83231
|
@ -33,13 +33,11 @@ import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
public final class FingerprintAnalyzer extends Analyzer {
|
public final class FingerprintAnalyzer extends Analyzer {
|
||||||
private final char separator;
|
private final char separator;
|
||||||
private final int maxOutputSize;
|
private final int maxOutputSize;
|
||||||
private final boolean preserveOriginal;
|
|
||||||
private final CharArraySet stopWords;
|
private final CharArraySet stopWords;
|
||||||
|
|
||||||
public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize, boolean preserveOriginal) {
|
public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize) {
|
||||||
this.separator = separator;
|
this.separator = separator;
|
||||||
this.maxOutputSize = maxOutputSize;
|
this.maxOutputSize = maxOutputSize;
|
||||||
this.preserveOriginal = preserveOriginal;
|
|
||||||
this.stopWords = stopWords;
|
this.stopWords = stopWords;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -48,7 +46,7 @@ public final class FingerprintAnalyzer extends Analyzer {
|
||||||
final Tokenizer tokenizer = new StandardTokenizer();
|
final Tokenizer tokenizer = new StandardTokenizer();
|
||||||
TokenStream stream = tokenizer;
|
TokenStream stream = tokenizer;
|
||||||
stream = new LowerCaseFilter(stream);
|
stream = new LowerCaseFilter(stream);
|
||||||
stream = new ASCIIFoldingFilter(stream, preserveOriginal);
|
stream = new ASCIIFoldingFilter(stream, false);
|
||||||
stream = new StopFilter(stream, stopWords);
|
stream = new StopFilter(stream, stopWords);
|
||||||
stream = new FingerprintFilter(stream, maxOutputSize, separator);
|
stream = new FingerprintFilter(stream, maxOutputSize, separator);
|
||||||
return new TokenStreamComponents(tokenizer, stream);
|
return new TokenStreamComponents(tokenizer, stream);
|
||||||
|
|
|
@ -34,10 +34,8 @@ import org.elasticsearch.index.IndexSettings;
|
||||||
public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {
|
public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {
|
||||||
|
|
||||||
public static ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE;
|
public static ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE;
|
||||||
public static ParseField PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.PRESERVE_ORIGINAL;
|
|
||||||
|
|
||||||
public static int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE;
|
public static int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE;
|
||||||
public static boolean DEFAULT_PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.DEFAULT_PRESERVE_ORIGINAL;
|
|
||||||
public static CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET;
|
public static CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET;
|
||||||
|
|
||||||
private final FingerprintAnalyzer analyzer;
|
private final FingerprintAnalyzer analyzer;
|
||||||
|
@ -47,10 +45,9 @@ public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<A
|
||||||
|
|
||||||
char separator = FingerprintTokenFilterFactory.parseSeparator(settings);
|
char separator = FingerprintTokenFilterFactory.parseSeparator(settings);
|
||||||
int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(),DEFAULT_MAX_OUTPUT_SIZE);
|
int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(),DEFAULT_MAX_OUTPUT_SIZE);
|
||||||
boolean preserveOriginal = settings.getAsBoolean(PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL);
|
|
||||||
CharArraySet stopWords = Analysis.parseStopWords(env, settings, DEFAULT_STOP_WORDS);
|
CharArraySet stopWords = Analysis.parseStopWords(env, settings, DEFAULT_STOP_WORDS);
|
||||||
|
|
||||||
this.analyzer = new FingerprintAnalyzer(stopWords, separator, maxOutputSize, preserveOriginal);
|
this.analyzer = new FingerprintAnalyzer(stopWords, separator, maxOutputSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -26,13 +26,13 @@ import org.elasticsearch.test.ESTokenStreamTestCase;
|
||||||
public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
|
public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
|
||||||
|
|
||||||
public void testFingerprint() throws Exception {
|
public void testFingerprint() throws Exception {
|
||||||
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
|
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
|
||||||
assertAnalyzesTo(a, "foo bar@baz Baz $ foo foo FOO. FoO",
|
assertAnalyzesTo(a, "foo bar@baz Baz $ foo foo FOO. FoO",
|
||||||
new String[]{"bar baz foo"});
|
new String[]{"bar baz foo"});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testReusableTokenStream() throws Exception {
|
public void testReusableTokenStream() throws Exception {
|
||||||
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
|
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
|
||||||
assertAnalyzesTo(a, "foo bar baz Baz foo foo FOO. FoO",
|
assertAnalyzesTo(a, "foo bar baz Baz foo foo FOO. FoO",
|
||||||
new String[]{"bar baz foo"});
|
new String[]{"bar baz foo"});
|
||||||
assertAnalyzesTo(a, "xyz XYZ abc 123.2 abc",
|
assertAnalyzesTo(a, "xyz XYZ abc 123.2 abc",
|
||||||
|
@ -40,7 +40,7 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testAsciifolding() throws Exception {
|
public void testAsciifolding() throws Exception {
|
||||||
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
|
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
|
||||||
assertAnalyzesTo(a, "gödel escher bach",
|
assertAnalyzesTo(a, "gödel escher bach",
|
||||||
new String[]{"bach escher godel"});
|
new String[]{"bach escher godel"});
|
||||||
|
|
||||||
|
@ -48,14 +48,8 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
|
||||||
new String[]{"bach escher godel"});
|
new String[]{"bach escher godel"});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testPreserveOriginal() throws Exception {
|
|
||||||
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true);
|
|
||||||
assertAnalyzesTo(a, "gödel escher bach",
|
|
||||||
new String[]{"bach escher godel gödel"});
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testLimit() throws Exception {
|
public void testLimit() throws Exception {
|
||||||
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 3, false);
|
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 3);
|
||||||
assertAnalyzesTo(a, "e d c b a",
|
assertAnalyzesTo(a, "e d c b a",
|
||||||
new String[]{});
|
new String[]{});
|
||||||
|
|
||||||
|
@ -63,10 +57,4 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
|
||||||
new String[]{"a b"});
|
new String[]{"a b"});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSeparator() throws Exception {
|
|
||||||
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, '_', 255, true);
|
|
||||||
assertAnalyzesTo(a, "b c a",
|
|
||||||
new String[]{"a_b_c"});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue