diff --git a/modules/analysis/CHANGES.txt b/modules/analysis/CHANGES.txt index 0b04badf96b..4180236bfc6 100644 --- a/modules/analysis/CHANGES.txt +++ b/modules/analysis/CHANGES.txt @@ -15,6 +15,9 @@ API Changes RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer behavior. (Steven Rowe, Robert Muir, Uwe Schindler) + * LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles + can be generated. (Chris Harris via Steven Rowe) + New Features * LUCENE-2413: Consolidated Solr analysis components into common. diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java index 8349eeb98c4..be60d2cc986 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java @@ -38,6 +38,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer { private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE; private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR; private boolean outputUnigrams = true; + private boolean outputUnigramsIfNoShingles = false; public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) { super(); @@ -147,6 +148,24 @@ public final class ShingleAnalyzerWrapper extends Analyzer { public void setOutputUnigrams(boolean outputUnigrams) { this.outputUnigrams = outputUnigrams; } + + public boolean isOutputUnigramsIfNoShingles() { + return outputUnigramsIfNoShingles; + } + + /** + *

Shall we override the behavior of outputUnigrams==false for those + * times when no shingles are available (because there are fewer than + * minShingleSize tokens in the input stream)? (default: false.) + *

Note that if outputUnigrams==true, then unigrams are always output, + * regardless of whether any shingles are available. + * + * @param outputUnigramsIfNoShingles Whether or not to output a single + * unigram when no shingles are available. + */ + public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) { + this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles; + } @Override public TokenStream tokenStream(String fieldName, Reader reader) { @@ -161,6 +180,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer { filter.setMaxShingleSize(maxShingleSize); filter.setTokenSeparator(tokenSeparator); filter.setOutputUnigrams(outputUnigrams); + filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles); return filter; } @@ -192,6 +212,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer { streams.shingle.setMinShingleSize(minShingleSize); streams.shingle.setTokenSeparator(tokenSeparator); streams.shingle.setOutputUnigrams(outputUnigrams); + streams.shingle.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles); return streams.shingle; } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java index cccd8cd1c33..268f05711ad 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java @@ -102,6 +102,11 @@ public final class ShingleFilter extends TokenFilter { */ private boolean outputUnigrams = true; + /** + * By default, we don't override behavior of outputUnigrams. + */ + private boolean outputUnigramsIfNoShingles = false; + /** * maximum shingle size (number of tokens) */ @@ -136,6 +141,11 @@ public final class ShingleFilter extends TokenFilter { * position. */ private boolean isOutputHere = false; + + /** + * true if no shingles have been output yet (for outputUnigramsIfNoShingles). + */ + boolean noShingleOutput = true; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); @@ -211,6 +221,20 @@ public final class ShingleFilter extends TokenFilter { gramSize = new CircularSequence(); } + /** + *

Shall we override the behavior of outputUnigrams==false for those + * times when no shingles are available (because there are fewer than + * minShingleSize tokens in the input stream)? (default: false.) + *

Note that if outputUnigrams==true, then unigrams are always output, + * regardless of whether any shingles are available. + * + * @param outputUnigramsIfNoShingles Whether or not to output a single + * unigram when no shingles are available. + */ + public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) { + this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles; + } + /** * Set the max shingle size (default: 2) * @@ -292,6 +316,7 @@ public final class ShingleFilter extends TokenFilter { termAtt.setEmpty().append(gramBuilder); if (gramSize.getValue() > 1) { typeAtt.setType(tokenType); + noShingleOutput = false; } offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset()); isOutputHere = true; @@ -395,6 +420,10 @@ public final class ShingleFilter extends TokenFilter { } } } + if (outputUnigramsIfNoShingles && noShingleOutput + && gramSize.minValue > 1 && inputWindow.size() < minShingleSize) { + gramSize.minValue = 1; + } gramSize.reset(); isOutputHere = false; } @@ -406,6 +435,11 @@ public final class ShingleFilter extends TokenFilter { inputWindow.clear(); numFillerTokensToInsert = 0; isOutputHere = false; + noShingleOutput = true; + if (outputUnigramsIfNoShingles && ! outputUnigrams) { + // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles + gramSize.minValue = minShingleSize; + } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java index 29c3a0f7b74..7b9762e101a 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java @@ -359,4 +359,16 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { new int[] { 13, 18, 27 }, new int[] { 1, 1, 1 }); } + + public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception { + ShingleAnalyzerWrapper analyzer + = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT)); + analyzer.setOutputUnigrams(false); + analyzer.setOutputUnigramsIfNoShingles(true); + assertAnalyzesToReuse(analyzer, "please", + new String[] { "please" }, + new int[] { 0 }, + new int[] { 6 }, + new int[] { 1 }); + } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java index a0268d19210..7bef76a6647 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java @@ -73,6 +73,14 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase { createToken("shingles", 33, 39), }; + public static final int[] UNIGRAM_ONLY_POSITION_INCREMENTS = new int[] { + 1, 1, 1, 1, 1, 1 + }; + + public static final String[] UNIGRAM_ONLY_TYPES = new String[] { + "word", "word", "word", "word", "word", "word" + }; + public static Token[] testTokenWithHoles; public static final Token[] BI_GRAM_TOKENS = new Token[] { @@ -1018,15 +1026,44 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase { new int[]{1,0,1,0,1,0,1} ); } - + + public void testOutputUnigramsIfNoShinglesSingleTokenCase() throws IOException { + // Single token input with outputUnigrams==false is the primary case where + // enabling this option should alter program behavior. + this.shingleFilterTest(2, 2, TEST_SINGLE_TOKEN, SINGLE_TOKEN, + SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES, + false, true); + } + + public void testOutputUnigramsIfNoShinglesWithSimpleBigram() throws IOException { + // Here we expect the same result as with testBiGramFilter(). + this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS, + BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES, + true, true); + } + + public void testOutputUnigramsIfNoShinglesWithSimpleUnigramlessBigram() throws IOException { + // Here we expect the same result as with testBiGramFilterWithoutUnigrams(). + this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS, + BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS, + false, true); + } + + public void testOutputUnigramsIfNoShinglesWithMultipleInputTokens() throws IOException { + // Test when the minimum shingle size is greater than the number of input tokens + this.shingleFilterTest(7, 7, TEST_TOKEN, TEST_TOKEN, + UNIGRAM_ONLY_POSITION_INCREMENTS, UNIGRAM_ONLY_TYPES, + false, true); + } + protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, int[] positionIncrements, String[] types, boolean outputUnigrams) throws IOException { ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize); - shingleFilterTestCommon - (filter, tokensToCompare, positionIncrements, types, outputUnigrams); + filter.setOutputUnigrams(outputUnigrams); + shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types); } protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle, @@ -1035,8 +1072,20 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase { throws IOException { ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize); - shingleFilterTestCommon - (filter, tokensToCompare, positionIncrements, types, outputUnigrams); + filter.setOutputUnigrams(outputUnigrams); + shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types); + } + + protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle, + Token[] tokensToCompare, int[] positionIncrements, + String[] types, boolean outputUnigrams, + boolean outputUnigramsIfNoShingles) + throws IOException { + ShingleFilter filter + = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize); + filter.setOutputUnigrams(outputUnigrams); + filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles); + shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types); } protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle, @@ -1046,18 +1095,15 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase { ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize); filter.setTokenSeparator(tokenSeparator); - shingleFilterTestCommon - (filter, tokensToCompare, positionIncrements, types, outputUnigrams); + filter.setOutputUnigrams(outputUnigrams); + shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types); } protected void shingleFilterTestCommon(ShingleFilter filter, Token[] tokensToCompare, int[] positionIncrements, - String[] types, boolean outputUnigrams) + String[] types) throws IOException { - - filter.setOutputUnigrams(outputUnigrams); - String text[] = new String[tokensToCompare.length]; int startOffsets[] = new int[tokensToCompare.length]; int endOffsets[] = new int[tokensToCompare.length];