Expose `filler_token` via ShingleTokenFilterFactory

Lucene 4.7 supports a setter for the `filler_token` that is
inserted if there are gaps in the token stream. This change exposes
this setting.

Closes #4307
This commit is contained in:
Simon Willnauer 2014-02-19 12:15:59 +01:00 committed by Adrien Grand
parent 30d7b8de2f
commit 9160516b28
3 changed files with 23 additions and 16 deletions

View File

@ -32,5 +32,9 @@ no effect. Defaults to `false`.
|`token_separator` |The string to use when joining adjacent tokens to
form a shingle. Defaults to `" "`.
|`filler_token` | The string to use as a replacement for each position
at which there is no actual token in the stream. For instance this string is
used if the position increment is greater than one when a `stop` filter is used
together with the `shingle` filter. Defaults to `"_"`.
|=======================================================================

View File

@ -42,20 +42,21 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);
Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator);
String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator, fillerToken);
}
@Override
public TokenStream create(TokenStream tokenStream) {
return factory.create(tokenStream);
return factory.create(tokenStream);
}
// Exposes the inner shingle Factory so callers (e.g. tests) can inspect the
// configured shingle settings after the token filter factory is built.
public Factory getInnerFactory() {
return this.factory;
}
public static final class Factory implements TokenFilterFactory {
private final int maxShingleSize;
@ -64,44 +65,47 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
private final boolean outputUnigramsIfNoShingles;
private final String tokenSeparator;
private final String fillerToken;
private int minShingleSize;
private final String name;
public Factory(String name) {
this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, ShingleFilter.DEFAULT_FILLER_TOKEN);
}
Factory(String name, int minShingleSize, int maxShingleSize, boolean outputUnigrams, boolean outputUnigramsIfNoShingles, String tokenSeparator) {
Factory(String name, int minShingleSize, int maxShingleSize, boolean outputUnigrams, boolean outputUnigramsIfNoShingles, String tokenSeparator, String fillerToken) {
this.maxShingleSize = maxShingleSize;
this.outputUnigrams = outputUnigrams;
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
this.tokenSeparator = tokenSeparator;
this.minShingleSize = minShingleSize;
this.fillerToken = fillerToken;
this.name = name;
}
// Wraps the given stream in a Lucene ShingleFilter configured with the
// min/max shingle sizes, unigram output flags, token separator, and filler token.
public TokenStream create(TokenStream tokenStream) {
ShingleFilter filter = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
filter.setOutputUnigrams(outputUnigrams);
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
filter.setTokenSeparator(tokenSeparator);
// setFillerToken is available since Lucene 4.7; the filler token is inserted
// at positions where the stream has a gap (e.g. a token removed by a stop filter).
filter.setFillerToken(fillerToken);
return filter;
}
// Maximum shingle size this factory configures on the ShingleFilter.
public int getMaxShingleSize() {
return maxShingleSize;
}
// Minimum shingle size this factory configures on the ShingleFilter.
public int getMinShingleSize() {
return minShingleSize;
}
// Whether single tokens (unigrams) are emitted alongside shingles.
public boolean getOutputUnigrams() {
return outputUnigrams;
}
// Whether unigrams are emitted only when no shingles can be formed from the stream.
public boolean getOutputUnigramsIfNoShingles() {
return outputUnigramsIfNoShingles;
}

View File

@ -75,9 +75,8 @@ public class ShingleTokenFilterFactoryTests extends ElasticsearchTokenStreamTest
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(RESOURCE);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("shingle_filler");
String source = "simon the sorcerer";
String[] expected = new String[]{"simon FILLER sorcerer"};
String[] expected = new String[]{"simon FILLER", "simon FILLER sorcerer", "FILLER sorcerer"};
TokenStream tokenizer = new StopFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(source)), StopFilter.makeStopSet(TEST_VERSION_CURRENT, "the"));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
}