Expose `filler_token` via ShingleTokenFilterFactory
Lucene 4.7 supports a setter for the `filler_token` that is inserted if there are gaps in the token stream. This change exposes this setting. Closes #4307
This commit is contained in:
parent
30d7b8de2f
commit
9160516b28
|
@ -32,5 +32,9 @@ no effect. Defaults to `false`.
|
|||
|
||||
|`token_separator` |The string to use when joining adjacent tokens to
|
||||
form a shingle. Defaults to `" "`.
|
||||
|`filler_token` |The string to use as a replacement for each position
|
||||
at which there is no actual token in the stream. For instance, this string is
|
||||
used if the position increment is greater than one when a `stop` filter is used
|
||||
together with the `shingle` filter. Defaults to `"_"`.
|
||||
|=======================================================================
|
||||
|
||||
|
|
|
@ -42,20 +42,21 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);
|
||||
Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
|
||||
String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
|
||||
factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator);
|
||||
String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator, fillerToken);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return factory.create(tokenStream);
|
||||
return factory.create(tokenStream);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
public Factory getInnerFactory() {
|
||||
return this.factory;
|
||||
}
|
||||
|
||||
|
||||
public static final class Factory implements TokenFilterFactory {
|
||||
private final int maxShingleSize;
|
||||
|
||||
|
@ -64,44 +65,47 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
private final boolean outputUnigramsIfNoShingles;
|
||||
|
||||
private final String tokenSeparator;
|
||||
private final String fillerToken;
|
||||
|
||||
private int minShingleSize;
|
||||
|
||||
private final String name;
|
||||
|
||||
|
||||
public Factory(String name) {
|
||||
this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
|
||||
this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
}
|
||||
|
||||
Factory(String name, int minShingleSize, int maxShingleSize, boolean outputUnigrams, boolean outputUnigramsIfNoShingles, String tokenSeparator) {
|
||||
|
||||
Factory(String name, int minShingleSize, int maxShingleSize, boolean outputUnigrams, boolean outputUnigramsIfNoShingles, String tokenSeparator, String fillerToken) {
|
||||
this.maxShingleSize = maxShingleSize;
|
||||
this.outputUnigrams = outputUnigrams;
|
||||
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
|
||||
this.tokenSeparator = tokenSeparator;
|
||||
this.minShingleSize = minShingleSize;
|
||||
this.fillerToken = fillerToken;
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
ShingleFilter filter = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||
filter.setTokenSeparator(tokenSeparator);
|
||||
filter.setFillerToken(fillerToken);
|
||||
return filter;
|
||||
}
|
||||
|
||||
public int getMaxShingleSize() {
|
||||
return maxShingleSize;
|
||||
}
|
||||
|
||||
|
||||
public int getMinShingleSize() {
|
||||
return minShingleSize;
|
||||
}
|
||||
|
||||
|
||||
public boolean getOutputUnigrams() {
|
||||
return outputUnigrams;
|
||||
}
|
||||
|
||||
|
||||
public boolean getOutputUnigramsIfNoShingles() {
|
||||
return outputUnigramsIfNoShingles;
|
||||
}
|
||||
|
|
|
@ -75,9 +75,8 @@ public class ShingleTokenFilterFactoryTests extends ElasticsearchTokenStreamTest
|
|||
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(RESOURCE);
|
||||
TokenFilterFactory tokenFilter = analysisService.tokenFilter("shingle_filler");
|
||||
String source = "simon the sorcerer";
|
||||
String[] expected = new String[]{"simon FILLER sorcerer"};
|
||||
String[] expected = new String[]{"simon FILLER", "simon FILLER sorcerer", "FILLER sorcerer"};
|
||||
TokenStream tokenizer = new StopFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(source)), StopFilter.makeStopSet(TEST_VERSION_CURRENT, "the"));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue