SOLR-744: Added option to ShingleFilterFactory to output unigrams if no shingles can be generated.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1006191 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2010-10-09 16:56:43 +00:00
parent f9e4f551e2
commit cd938760da
3 changed files with 27 additions and 0 deletions

View File

@ -168,6 +168,11 @@ New Features
parameters for controlling the minimum shingle size produced by the filter, and parameters for controlling the minimum shingle size produced by the filter, and
the separator string that it uses, respectively. (Steven Rowe via rmuir) the separator string that it uses, respectively. (Steven Rowe via rmuir)
* SOLR-744: ShingleFilterFactory supports the "outputUnigramsIfNoShingles"
parameter, which outputs unigrams when the number of input tokens is fewer
than minShingleSize and therefore no shingles can be generated.
(Chris Harris via Steven Rowe)
* SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now * SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now
supports "percentages" which get evaluated relative to the current size of supports "percentages" which get evaluated relative to the current size of
the cache when warming happens. the cache when warming happens.

View File

@ -31,6 +31,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory {
private int minShingleSize; private int minShingleSize;
private int maxShingleSize; private int maxShingleSize;
private boolean outputUnigrams; private boolean outputUnigrams;
private boolean outputUnigramsIfNoShingles;
private String tokenSeparator; private String tokenSeparator;
public void init(Map<String, String> args) { public void init(Map<String, String> args) {
@ -56,6 +57,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory {
+ maxShingleSize + ")"); + maxShingleSize + ")");
} }
outputUnigrams = getBoolean("outputUnigrams", true); outputUnigrams = getBoolean("outputUnigrams", true);
outputUnigramsIfNoShingles = getBoolean("outputUnigramsIfNoShingles", false);
tokenSeparator = args.containsKey("tokenSeparator") tokenSeparator = args.containsKey("tokenSeparator")
? args.get("tokenSeparator") ? args.get("tokenSeparator")
: ShingleFilter.TOKEN_SEPARATOR; : ShingleFilter.TOKEN_SEPARATOR;
@ -63,6 +65,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory {
public ShingleFilter create(TokenStream input) { public ShingleFilter create(TokenStream input) {
ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize); ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
r.setOutputUnigrams(outputUnigrams); r.setOutputUnigrams(outputUnigrams);
r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
r.setTokenSeparator(tokenSeparator); r.setTokenSeparator(tokenSeparator);
return r; return r;
} }

View File

@ -216,4 +216,23 @@ public class TestShingleFilterFactory extends BaseTokenTestCase {
new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test", new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test",
"is=BLAH=a=BLAH=test", }); "is=BLAH=a=BLAH=test", });
} }
/**
 * Verifies the "outputUnigramsIfNoShingles" fallback with a single input
 * token. Unigram output is switched off via "outputUnigrams", and the
 * min/max shingle sizes are left at their defaults (2/2), so the lone
 * token is too short to form any shingle. Because
 * "outputUnigramsIfNoShingles" is enabled, the filter is nevertheless
 * expected to emit that token as one unigram.
 */
public void testOutputUnigramsIfNoShingles() throws Exception {
  // Factory configuration: unigrams off, but allowed as a fallback.
  Map<String,String> params = new HashMap<String,String>();
  params.put("outputUnigramsIfNoShingles", "true");
  params.put("outputUnigrams", "false");

  ShingleFilterFactory shingleFactory = new ShingleFilterFactory();
  shingleFactory.init(params);

  // A one-token input: fewer tokens than the minimum shingle size.
  Reader input = new StringReader("test");
  TokenStream tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, input);
  TokenStream shingles = shingleFactory.create(tokenizer);

  String[] expected = new String[] { "test" };
  assertTokenStreamContents(shingles, expected);
}
} }