diff --git a/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc index e3c6c44e196..5e3565cf83c 100644 --- a/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc @@ -32,5 +32,9 @@ no effect. Defaults to `false`. |`token_separator` |The string to use when joining adjacent tokens to form a shingle. Defaults to `" "`. +|`filler_token` | The string to use as a replacement for each position +at which there is no actual token in the stream. For instance, this string is +used if the position increment is greater than one when a `stop` filter is used +together with the `shingle` filter. Defaults to `"_"`. |======================================================================= diff --git a/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java index b8c1e9e4976..28e4d919845 100644 --- a/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java @@ -42,20 +42,21 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory { Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true); Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false); String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR); - factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator); + String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN); + factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator, fillerToken); } - + @Override public TokenStream create(TokenStream 
tokenStream) { - return factory.create(tokenStream); + return factory.create(tokenStream); } - - + + public Factory getInnerFactory() { return this.factory; } - + public static final class Factory implements TokenFilterFactory { private final int maxShingleSize; @@ -64,44 +65,47 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory { private final boolean outputUnigramsIfNoShingles; private final String tokenSeparator; + private final String fillerToken; private int minShingleSize; private final String name; - + public Factory(String name) { - this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR); + this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, ShingleFilter.DEFAULT_FILLER_TOKEN); } - - Factory(String name, int minShingleSize, int maxShingleSize, boolean outputUnigrams, boolean outputUnigramsIfNoShingles, String tokenSeparator) { + + Factory(String name, int minShingleSize, int maxShingleSize, boolean outputUnigrams, boolean outputUnigramsIfNoShingles, String tokenSeparator, String fillerToken) { this.maxShingleSize = maxShingleSize; this.outputUnigrams = outputUnigrams; this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles; this.tokenSeparator = tokenSeparator; this.minShingleSize = minShingleSize; + this.fillerToken = fillerToken; this.name = name; } - + public TokenStream create(TokenStream tokenStream) { ShingleFilter filter = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize); filter.setOutputUnigrams(outputUnigrams); filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles); filter.setTokenSeparator(tokenSeparator); + filter.setFillerToken(fillerToken); return filter; } public int getMaxShingleSize() { return maxShingleSize; } - + public int getMinShingleSize() { return minShingleSize; } - + public boolean getOutputUnigrams() { 
return outputUnigrams; } - + public boolean getOutputUnigramsIfNoShingles() { return outputUnigramsIfNoShingles; } diff --git a/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java b/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java index 701caded99e..5d0088a866c 100644 --- a/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java @@ -75,9 +75,8 @@ public class ShingleTokenFilterFactoryTests extends ElasticsearchTokenStreamTest AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(RESOURCE); TokenFilterFactory tokenFilter = analysisService.tokenFilter("shingle_filler"); String source = "simon the sorcerer"; - String[] expected = new String[]{"simon FILLER sorcerer"}; + String[] expected = new String[]{"simon FILLER", "simon FILLER sorcerer", "FILLER sorcerer"}; TokenStream tokenizer = new StopFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(source)), StopFilter.makeStopSet(TEST_VERSION_CURRENT, "the")); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); } - }