Remove the preserve_original option from the FingerprintAnalyzer (#18471)

The preserve_original option to the ASCIIFoldingFilter doesn't
play well with the FingerprintFilter, as it ends up producing
fingerprints like:

    "and consistent godel gödel is said sentence this yes"

The goal of the OpenRefine algorithm is to produce a small normalized
ASCII fingerprint. There's no need to expose preserve_original.
This commit is contained in:
Clinton Gormley 2016-05-19 19:37:13 +02:00
parent 8486488627
commit dc33a83231
3 changed files with 7 additions and 24 deletions

View File

@@ -33,13 +33,11 @@ import org.apache.lucene.analysis.util.CharArraySet;
public final class FingerprintAnalyzer extends Analyzer { public final class FingerprintAnalyzer extends Analyzer {
private final char separator; private final char separator;
private final int maxOutputSize; private final int maxOutputSize;
private final boolean preserveOriginal;
private final CharArraySet stopWords; private final CharArraySet stopWords;
public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize, boolean preserveOriginal) { public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize) {
this.separator = separator; this.separator = separator;
this.maxOutputSize = maxOutputSize; this.maxOutputSize = maxOutputSize;
this.preserveOriginal = preserveOriginal;
this.stopWords = stopWords; this.stopWords = stopWords;
} }
@@ -48,7 +46,7 @@ public final class FingerprintAnalyzer extends Analyzer {
final Tokenizer tokenizer = new StandardTokenizer(); final Tokenizer tokenizer = new StandardTokenizer();
TokenStream stream = tokenizer; TokenStream stream = tokenizer;
stream = new LowerCaseFilter(stream); stream = new LowerCaseFilter(stream);
stream = new ASCIIFoldingFilter(stream, preserveOriginal); stream = new ASCIIFoldingFilter(stream, false);
stream = new StopFilter(stream, stopWords); stream = new StopFilter(stream, stopWords);
stream = new FingerprintFilter(stream, maxOutputSize, separator); stream = new FingerprintFilter(stream, maxOutputSize, separator);
return new TokenStreamComponents(tokenizer, stream); return new TokenStreamComponents(tokenizer, stream);

View File

@@ -34,10 +34,8 @@ import org.elasticsearch.index.IndexSettings;
public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> { public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {
public static ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE; public static ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE;
public static ParseField PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.PRESERVE_ORIGINAL;
public static int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE; public static int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE;
public static boolean DEFAULT_PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.DEFAULT_PRESERVE_ORIGINAL;
public static CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET; public static CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET;
private final FingerprintAnalyzer analyzer; private final FingerprintAnalyzer analyzer;
@@ -47,10 +45,9 @@ public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<A
char separator = FingerprintTokenFilterFactory.parseSeparator(settings); char separator = FingerprintTokenFilterFactory.parseSeparator(settings);
int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(),DEFAULT_MAX_OUTPUT_SIZE); int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(),DEFAULT_MAX_OUTPUT_SIZE);
boolean preserveOriginal = settings.getAsBoolean(PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL);
CharArraySet stopWords = Analysis.parseStopWords(env, settings, DEFAULT_STOP_WORDS); CharArraySet stopWords = Analysis.parseStopWords(env, settings, DEFAULT_STOP_WORDS);
this.analyzer = new FingerprintAnalyzer(stopWords, separator, maxOutputSize, preserveOriginal); this.analyzer = new FingerprintAnalyzer(stopWords, separator, maxOutputSize);
} }
@Override @Override

View File

@@ -26,13 +26,13 @@ import org.elasticsearch.test.ESTokenStreamTestCase;
public class FingerprintAnalyzerTests extends ESTokenStreamTestCase { public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
public void testFingerprint() throws Exception { public void testFingerprint() throws Exception {
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false); Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
assertAnalyzesTo(a, "foo bar@baz Baz $ foo foo FOO. FoO", assertAnalyzesTo(a, "foo bar@baz Baz $ foo foo FOO. FoO",
new String[]{"bar baz foo"}); new String[]{"bar baz foo"});
} }
public void testReusableTokenStream() throws Exception { public void testReusableTokenStream() throws Exception {
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false); Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
assertAnalyzesTo(a, "foo bar baz Baz foo foo FOO. FoO", assertAnalyzesTo(a, "foo bar baz Baz foo foo FOO. FoO",
new String[]{"bar baz foo"}); new String[]{"bar baz foo"});
assertAnalyzesTo(a, "xyz XYZ abc 123.2 abc", assertAnalyzesTo(a, "xyz XYZ abc 123.2 abc",
@@ -40,7 +40,7 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
} }
public void testAsciifolding() throws Exception { public void testAsciifolding() throws Exception {
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false); Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
assertAnalyzesTo(a, "gödel escher bach", assertAnalyzesTo(a, "gödel escher bach",
new String[]{"bach escher godel"}); new String[]{"bach escher godel"});
@@ -48,14 +48,8 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
new String[]{"bach escher godel"}); new String[]{"bach escher godel"});
} }
public void testPreserveOriginal() throws Exception {
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true);
assertAnalyzesTo(a, "gödel escher bach",
new String[]{"bach escher godel gödel"});
}
public void testLimit() throws Exception { public void testLimit() throws Exception {
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 3, false); Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 3);
assertAnalyzesTo(a, "e d c b a", assertAnalyzesTo(a, "e d c b a",
new String[]{}); new String[]{});
@@ -63,10 +57,4 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
new String[]{"a b"}); new String[]{"a b"});
} }
public void testSeparator() throws Exception {
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, '_', 255, true);
assertAnalyzesTo(a, "b c a",
new String[]{"a_b_c"});
}
} }