mirror of https://github.com/apache/lucene.git
SOLR-744: Added option to ShingleFilterFactory to output unigrams if no shingles can be generated.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1006191 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f9e4f551e2
commit
cd938760da
|
@ -168,6 +168,11 @@ New Features
|
||||||
parameters for controlling the minimum shingle size produced by the filter, and
|
parameters for controlling the minimum shingle size produced by the filter, and
|
||||||
the separator string that it uses, respectively. (Steven Rowe via rmuir)
|
the separator string that it uses, respectively. (Steven Rowe via rmuir)
|
||||||
|
|
||||||
|
* SOLR-744: ShingleFilterFactory supports the "outputUnigramsIfNoShingles"
|
||||||
|
parameter, to output unigrams if the number of input tokens is fewer than
|
||||||
|
minShingleSize, and no shingles can be generated.
|
||||||
|
(Chris Harris via Steven Rowe)
|
||||||
|
|
||||||
* SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now
|
* SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now
|
||||||
supports "percentages" which get evaluated relative to the current size of
|
supports "percentages" which get evaluated relative to the current size of
|
||||||
the cache when warming happens.
|
the cache when warming happens.
|
||||||
|
|
|
@ -31,6 +31,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory {
|
||||||
private int minShingleSize;
|
private int minShingleSize;
|
||||||
private int maxShingleSize;
|
private int maxShingleSize;
|
||||||
private boolean outputUnigrams;
|
private boolean outputUnigrams;
|
||||||
|
private boolean outputUnigramsIfNoShingles;
|
||||||
private String tokenSeparator;
|
private String tokenSeparator;
|
||||||
|
|
||||||
public void init(Map<String, String> args) {
|
public void init(Map<String, String> args) {
|
||||||
|
@ -56,6 +57,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory {
|
||||||
+ maxShingleSize + ")");
|
+ maxShingleSize + ")");
|
||||||
}
|
}
|
||||||
outputUnigrams = getBoolean("outputUnigrams", true);
|
outputUnigrams = getBoolean("outputUnigrams", true);
|
||||||
|
outputUnigramsIfNoShingles = getBoolean("outputUnigramsIfNoShingles", false);
|
||||||
tokenSeparator = args.containsKey("tokenSeparator")
|
tokenSeparator = args.containsKey("tokenSeparator")
|
||||||
? args.get("tokenSeparator")
|
? args.get("tokenSeparator")
|
||||||
: ShingleFilter.TOKEN_SEPARATOR;
|
: ShingleFilter.TOKEN_SEPARATOR;
|
||||||
|
@ -63,6 +65,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory {
|
||||||
public ShingleFilter create(TokenStream input) {
|
public ShingleFilter create(TokenStream input) {
|
||||||
ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
|
ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
|
||||||
r.setOutputUnigrams(outputUnigrams);
|
r.setOutputUnigrams(outputUnigrams);
|
||||||
|
r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||||
r.setTokenSeparator(tokenSeparator);
|
r.setTokenSeparator(tokenSeparator);
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
|
@ -216,4 +216,23 @@ public class TestShingleFilterFactory extends BaseTokenTestCase {
|
||||||
new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test",
|
new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test",
|
||||||
"is=BLAH=a=BLAH=test", });
|
"is=BLAH=a=BLAH=test", });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test with unigrams disabled except when there are no shingles, with
|
||||||
|
* a single input token. Using default min/max shingle sizes: 2/2. No
|
||||||
|
* shingles will be created, since there are fewer input tokens than
|
||||||
|
* min shingle size. However, because outputUnigramsIfNoShingles is
|
||||||
|
* set to true, even though outputUnigrams is set to false, one
|
||||||
|
* unigram should be output.
|
||||||
|
*/
|
||||||
|
public void testOutputUnigramsIfNoShingles() throws Exception {
|
||||||
|
Reader reader = new StringReader("test");
|
||||||
|
Map<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("outputUnigrams", "false");
|
||||||
|
args.put("outputUnigramsIfNoShingles", "true");
|
||||||
|
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||||
|
factory.init(args);
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "test" });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue