From cd938760da769a40108f05c178f41c79f8c92339 Mon Sep 17 00:00:00 2001 From: Steven Rowe Date: Sat, 9 Oct 2010 16:56:43 +0000 Subject: [PATCH] SOLR-744: Added option to ShingleFilterFactory to output unigrams if no shingles can be generated. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1006191 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 5 +++++ .../solr/analysis/ShingleFilterFactory.java | 3 +++ .../analysis/TestShingleFilterFactory.java | 19 +++++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 31510ff42c1..b657076ee35 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -168,6 +168,11 @@ New Features parameters for controlling the minimum shingle size produced by the filter, and the separator string that it uses, respectively. (Steven Rowe via rmuir) +* SOLR-744: ShingleFilterFactory supports the "outputUnigramsIfNoShingles" + parameter, to output unigrams if the number of input tokens is fewer than + minShingleSize, and no shingles can be generated. + (Chris Harris via Steven Rowe) + * SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now supports "percentages" which get evaluated relative the current size of the cache when warming happens. diff --git a/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java index bf8bdb23f1d..c158d33ae03 100644 --- a/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java @@ -31,6 +31,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory { private int minShingleSize; private int maxShingleSize; private boolean outputUnigrams; + private boolean outputUnigramsIfNoShingles; private String tokenSeparator; public void init(Map args) { @@ -56,6 +57,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory { + maxShingleSize + ")"); } outputUnigrams = getBoolean("outputUnigrams", true); + outputUnigramsIfNoShingles = getBoolean("outputUnigramsIfNoShingles", false); tokenSeparator = args.containsKey("tokenSeparator") ? args.get("tokenSeparator") : ShingleFilter.TOKEN_SEPARATOR; @@ -63,6 +65,7 @@ public class ShingleFilterFactory extends BaseTokenFilterFactory { public ShingleFilter create(TokenStream input) { ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize); r.setOutputUnigrams(outputUnigrams); + r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles); r.setTokenSeparator(tokenSeparator); return r; } diff --git a/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java index 1f30feea724..de0a1fa01e6 100644 --- a/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java @@ -216,4 +216,23 @@ public class TestShingleFilterFactory extends BaseTokenTestCase { new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test", "is=BLAH=a=BLAH=test", }); } + + /** + * Test with unigrams disabled except when there are no shingles, with + * a single input token. Using default min/max shingle sizes: 2/2. No + * shingles will be created, since there are fewer input tokens than + * min shingle size. However, because outputUnigramsIfNoShingles is + * set to true, even though outputUnigrams is set to false, one + * unigram should be output. + */ + public void testOutputUnigramsIfNoShingles() throws Exception { + Reader reader = new StringReader("test"); + Map args = new HashMap(); + args.put("outputUnigrams", "false"); + args.put("outputUnigramsIfNoShingles", "true"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "test" }); + } }