mirror of https://github.com/apache/lucene.git
SOLR-744: Added option to ShingleFilterFactory to output unigrams if no shingles can be generated.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1006191 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f9e4f551e2
commit
cd938760da
|
@ -168,6 +168,11 @@ New Features
|
|||
parameters for controlling the minimum shingle size produced by the filter, and
|
||||
the separator string that it uses, respectively. (Steven Rowe via rmuir)
|
||||
|
||||
* SOLR-744: ShingleFilterFactory supports the "outputUnigramsIfNoShingles"
|
||||
parameter, to output unigrams if the number of input tokens is fewer than
|
||||
minShingleSize, and no shingles can be generated.
|
||||
(Chris Harris via Steven Rowe)
|
||||
|
||||
* SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now
|
||||
supports "percentages" which get evaluated relative the current size of
|
||||
the cache when warming happens.
|
||||
|
|
|
@ -31,6 +31,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory {
|
|||
private int minShingleSize;
|
||||
private int maxShingleSize;
|
||||
private boolean outputUnigrams;
|
||||
private boolean outputUnigramsIfNoShingles;
|
||||
private String tokenSeparator;
|
||||
|
||||
public void init(Map<String, String> args) {
|
||||
|
@ -56,6 +57,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory {
|
|||
+ maxShingleSize + ")");
|
||||
}
|
||||
outputUnigrams = getBoolean("outputUnigrams", true);
|
||||
outputUnigramsIfNoShingles = getBoolean("outputUnigramsIfNoShingles", false);
|
||||
tokenSeparator = args.containsKey("tokenSeparator")
|
||||
? args.get("tokenSeparator")
|
||||
: ShingleFilter.TOKEN_SEPARATOR;
|
||||
|
@ -63,6 +65,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory {
|
|||
public ShingleFilter create(TokenStream input) {
|
||||
ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
|
||||
r.setOutputUnigrams(outputUnigrams);
|
||||
r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||
r.setTokenSeparator(tokenSeparator);
|
||||
return r;
|
||||
}
|
||||
|
|
|
@ -216,4 +216,23 @@ public class TestShingleFilterFactory extends BaseTokenTestCase {
|
|||
new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test",
|
||||
"is=BLAH=a=BLAH=test", });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test with unigrams disabled except when there are no shingles, with
|
||||
* a single input token. Using default min/max shingle sizes: 2/2. No
|
||||
* shingles will be created, since there are fewer input tokens than
|
||||
* min shingle size. However, because outputUnigramsIfNoShingles is
|
||||
* set to true, even though outputUnigrams is set to false, one
|
||||
* unigram should be output.
|
||||
*/
|
||||
public void testOutputUnigramsIfNoShingles() throws Exception {
|
||||
Reader reader = new StringReader("test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("outputUnigrams", "false");
|
||||
args.put("outputUnigramsIfNoShingles", "true");
|
||||
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||
assertTokenStreamContents(stream, new String[] { "test" });
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue