SOLR-744: Added option to ShingleFilterFactory to output unigrams if no shingles can be generated.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1006191 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2010-10-09 16:56:43 +00:00
parent f9e4f551e2
commit cd938760da
3 changed files with 27 additions and 0 deletions

View File

@ -168,6 +168,11 @@ New Features
parameters for controlling the minimum shingle size produced by the filter, and
the separator string that it uses, respectively. (Steven Rowe via rmuir)
* SOLR-744: ShingleFilterFactory supports the "outputUnigramsIfNoShingles"
parameter, to output unigrams if the number of input tokens is fewer than
minShingleSize, and no shingles can be generated.
(Chris Harris via Steven Rowe)
* SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now
supports "percentages" which get evaluated relative to the current size of
the cache when warming happens.

View File

@ -31,6 +31,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory {
private int minShingleSize;
private int maxShingleSize;
private boolean outputUnigrams;
private boolean outputUnigramsIfNoShingles;
private String tokenSeparator;
public void init(Map<String, String> args) {
@ -56,6 +57,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory {
+ maxShingleSize + ")");
}
outputUnigrams = getBoolean("outputUnigrams", true);
outputUnigramsIfNoShingles = getBoolean("outputUnigramsIfNoShingles", false);
tokenSeparator = args.containsKey("tokenSeparator")
? args.get("tokenSeparator")
: ShingleFilter.TOKEN_SEPARATOR;
@ -63,6 +65,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory {
/**
 * Wraps the given token stream in a {@link ShingleFilter} built with this
 * factory's min/max shingle sizes, then applies the remaining configured
 * options (outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator)
 * through the filter's setters.
 *
 * @param input the token stream to wrap
 * @return the configured shingle filter
 */
public ShingleFilter create(TokenStream input) {
  final ShingleFilter shingleFilter =
      new ShingleFilter(input, minShingleSize, maxShingleSize);

  // Setter order is kept as-is: unigram flags first, separator last.
  shingleFilter.setOutputUnigrams(outputUnigrams);
  shingleFilter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
  shingleFilter.setTokenSeparator(tokenSeparator);

  return shingleFilter;
}

View File

@ -216,4 +216,23 @@ public class TestShingleFilterFactory extends BaseTokenTestCase {
new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test",
"is=BLAH=a=BLAH=test", });
}
/**
 * Exercises the "outputUnigramsIfNoShingles" option with a single input
 * token. The factory is initialized with outputUnigrams=false and
 * outputUnigramsIfNoShingles=true, leaving the shingle sizes at their
 * defaults (min/max of 2/2). One token is fewer than the minimum shingle
 * size, so no shingle can be formed; the fallback option is expected to
 * let that lone token through as a unigram even though regular unigram
 * output is switched off.
 */
public void testOutputUnigramsIfNoShingles() throws Exception {
  // Factory configuration: unigrams off, fallback-to-unigram on.
  Map<String,String> factoryArgs = new HashMap<String,String>();
  factoryArgs.put("outputUnigrams", "false");
  factoryArgs.put("outputUnigramsIfNoShingles", "true");

  ShingleFilterFactory shingleFactory = new ShingleFilterFactory();
  shingleFactory.init(factoryArgs);

  // A single whitespace-delimited token: too short to produce any shingle.
  Reader singleTokenInput = new StringReader("test");
  WhitespaceTokenizer tokenizer =
      new WhitespaceTokenizer(DEFAULT_VERSION, singleTokenInput);
  TokenStream filtered = shingleFactory.create(tokenizer);

  String[] expectedTokens = new String[] { "test" };
  assertTokenStreamContents(filtered, expectedTokens);
}
}