LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles can be generated.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1006187 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2010-10-09 16:55:23 +00:00
parent 2e6787a850
commit f9e4f551e2
5 changed files with 127 additions and 11 deletions

View File

@ -15,6 +15,9 @@ API Changes
RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
behavior. (Steven Rowe, Robert Muir, Uwe Schindler) behavior. (Steven Rowe, Robert Muir, Uwe Schindler)
* LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
can be generated. (Chris Harris via Steven Rowe)
New Features New Features
* LUCENE-2413: Consolidated Solr analysis components into common. * LUCENE-2413: Consolidated Solr analysis components into common.

View File

@ -38,6 +38,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE; private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR; private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
private boolean outputUnigrams = true; private boolean outputUnigrams = true;
private boolean outputUnigramsIfNoShingles = false;
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) { public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
super(); super();
@ -148,6 +149,24 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
this.outputUnigrams = outputUnigrams; this.outputUnigrams = outputUnigrams;
} }
/**
* Returns the outputUnigramsIfNoShingles setting currently held by this
* analyzer (false unless changed through
* {@link #setOutputUnigramsIfNoShingles(boolean)}).
*
* @return the current value of outputUnigramsIfNoShingles
*/
public boolean isOutputUnigramsIfNoShingles() {
return outputUnigramsIfNoShingles;
}
/**
* <p>Shall we override the behavior of outputUnigrams==false for those
* times when no shingles are available (because there are fewer than
* minShingleSize tokens in the input stream)? (default: false.)
* <p>Note that if outputUnigrams==true, then unigrams are always output,
* regardless of whether any shingles are available.
* <p>The value is stored on this analyzer and passed on to the
* ShingleFilter instances it configures.
*
* @param outputUnigramsIfNoShingles Whether or not to output the input
* tokens as unigrams when no shingles are available.
*/
public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
}
@Override @Override
public TokenStream tokenStream(String fieldName, Reader reader) { public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream wrapped; TokenStream wrapped;
@ -161,6 +180,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
filter.setMaxShingleSize(maxShingleSize); filter.setMaxShingleSize(maxShingleSize);
filter.setTokenSeparator(tokenSeparator); filter.setTokenSeparator(tokenSeparator);
filter.setOutputUnigrams(outputUnigrams); filter.setOutputUnigrams(outputUnigrams);
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
return filter; return filter;
} }
@ -192,6 +212,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
streams.shingle.setMinShingleSize(minShingleSize); streams.shingle.setMinShingleSize(minShingleSize);
streams.shingle.setTokenSeparator(tokenSeparator); streams.shingle.setTokenSeparator(tokenSeparator);
streams.shingle.setOutputUnigrams(outputUnigrams); streams.shingle.setOutputUnigrams(outputUnigrams);
streams.shingle.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
return streams.shingle; return streams.shingle;
} }
} }

View File

@ -102,6 +102,11 @@ public final class ShingleFilter extends TokenFilter {
*/ */
private boolean outputUnigrams = true; private boolean outputUnigrams = true;
/**
* Whether unigrams should be output when no shingles are available, even
* though outputUnigrams is false.
* By default, we don't override behavior of outputUnigrams.
*/
private boolean outputUnigramsIfNoShingles = false;
/** /**
* maximum shingle size (number of tokens) * maximum shingle size (number of tokens)
*/ */
@ -137,6 +142,11 @@ public final class ShingleFilter extends TokenFilter {
*/ */
private boolean isOutputHere = false; private boolean isOutputHere = false;
/**
* true if no shingles have been output yet (for outputUnigramsIfNoShingles).
* Set to false once a gram of size greater than one is emitted, and set
* back to true where the rest of the per-stream state (inputWindow,
* isOutputHere) is cleared.
* NOTE(review): declared without an access modifier, unlike the neighbouring
* private isOutputHere flag - confirm package-private access is intended.
*/
boolean noShingleOutput = true;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@ -211,6 +221,20 @@ public final class ShingleFilter extends TokenFilter {
gramSize = new CircularSequence(); gramSize = new CircularSequence();
} }
/**
* <p>Shall we override the behavior of outputUnigrams==false for those
* times when no shingles are available (because there are fewer than
* minShingleSize tokens in the input stream)? (default: false.)
* <p>Note that if outputUnigrams==true, then unigrams are always output,
* regardless of whether any shingles are available.
*
* @param outputUnigramsIfNoShingles Whether or not to output the input
* tokens as unigrams when no shingles are available.
*/
public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
}
/** /**
* Set the max shingle size (default: 2) * Set the max shingle size (default: 2)
* *
@ -292,6 +316,7 @@ public final class ShingleFilter extends TokenFilter {
termAtt.setEmpty().append(gramBuilder); termAtt.setEmpty().append(gramBuilder);
if (gramSize.getValue() > 1) { if (gramSize.getValue() > 1) {
typeAtt.setType(tokenType); typeAtt.setType(tokenType);
noShingleOutput = false;
} }
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset()); offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
isOutputHere = true; isOutputHere = true;
@ -395,6 +420,10 @@ public final class ShingleFilter extends TokenFilter {
} }
} }
} }
if (outputUnigramsIfNoShingles && noShingleOutput
&& gramSize.minValue > 1 && inputWindow.size() < minShingleSize) {
gramSize.minValue = 1;
}
gramSize.reset(); gramSize.reset();
isOutputHere = false; isOutputHere = false;
} }
@ -406,6 +435,11 @@ public final class ShingleFilter extends TokenFilter {
inputWindow.clear(); inputWindow.clear();
numFillerTokensToInsert = 0; numFillerTokensToInsert = 0;
isOutputHere = false; isOutputHere = false;
noShingleOutput = true;
if (outputUnigramsIfNoShingles && ! outputUnigrams) {
// Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
gramSize.minValue = minShingleSize;
}
} }

View File

@ -359,4 +359,16 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 13, 18, 27 }, new int[] { 13, 18, 27 },
new int[] { 1, 1, 1 }); new int[] { 1, 1, 1 });
} }
/**
 * With outputUnigrams disabled and outputUnigramsIfNoShingles enabled, a
 * one-token input is expected to come back from the analyzer as that single
 * token.
 */
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
  // Expectations for the lone token "please".
  final String[] expectedTerms = { "please" };
  final int[] expectedStartOffsets = { 0 };
  final int[] expectedEndOffsets = { 6 };
  final int[] expectedPositionIncrements = { 1 };

  final WhitespaceAnalyzer delegate = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
  final ShingleAnalyzerWrapper wrapper = new ShingleAnalyzerWrapper(delegate);
  wrapper.setOutputUnigrams(false);
  wrapper.setOutputUnigramsIfNoShingles(true);

  assertAnalyzesToReuse(wrapper, "please",
      expectedTerms,
      expectedStartOffsets,
      expectedEndOffsets,
      expectedPositionIncrements);
}
} }

View File

@ -73,6 +73,14 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
createToken("shingles", 33, 39), createToken("shingles", 33, 39),
}; };
/**
* Expected position increments (six entries, all 1) used by
* testOutputUnigramsIfNoShinglesWithMultipleInputTokens.
*/
public static final int[] UNIGRAM_ONLY_POSITION_INCREMENTS = new int[] {
1, 1, 1, 1, 1, 1
};
/**
* Expected token types (six entries, all "word") used by
* testOutputUnigramsIfNoShinglesWithMultipleInputTokens.
*/
public static final String[] UNIGRAM_ONLY_TYPES = new String[] {
"word", "word", "word", "word", "word", "word"
};
public static Token[] testTokenWithHoles; public static Token[] testTokenWithHoles;
public static final Token[] BI_GRAM_TOKENS = new Token[] { public static final Token[] BI_GRAM_TOKENS = new Token[] {
@ -1019,14 +1027,43 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
); );
} }
/**
 * Single token input with outputUnigrams==false is the primary case where
 * enabling outputUnigramsIfNoShingles should alter program behavior.
 */
public void testOutputUnigramsIfNoShinglesSingleTokenCase() throws IOException {
  final int minShingleSize = 2;
  final int maxShingleSize = 2;
  final boolean outputUnigrams = false;
  final boolean outputUnigramsIfNoShingles = true;

  this.shingleFilterTest(minShingleSize, maxShingleSize,
      TEST_SINGLE_TOKEN, SINGLE_TOKEN,
      SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
      outputUnigrams, outputUnigramsIfNoShingles);
}
/**
 * With outputUnigrams enabled, turning on outputUnigramsIfNoShingles as well
 * is expected to give the same result as testBiGramFilter().
 */
public void testOutputUnigramsIfNoShinglesWithSimpleBigram() throws IOException {
  final int minShingleSize = 2;
  final int maxShingleSize = 2;
  final boolean outputUnigrams = true;
  final boolean outputUnigramsIfNoShingles = true;

  this.shingleFilterTest(minShingleSize, maxShingleSize,
      TEST_TOKEN, BI_GRAM_TOKENS,
      BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
      outputUnigrams, outputUnigramsIfNoShingles);
}
/**
 * With outputUnigrams disabled but bigrams available, the result is expected
 * to be the same as with testBiGramFilterWithoutUnigrams().
 */
public void testOutputUnigramsIfNoShinglesWithSimpleUnigramlessBigram() throws IOException {
  final int minShingleSize = 2;
  final int maxShingleSize = 2;
  final boolean outputUnigrams = false;
  final boolean outputUnigramsIfNoShingles = true;

  this.shingleFilterTest(minShingleSize, maxShingleSize,
      TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
      BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
      outputUnigrams, outputUnigramsIfNoShingles);
}
/**
 * Test when the minimum shingle size is greater than the number of input
 * tokens: the input tokens themselves are the expected output.
 */
public void testOutputUnigramsIfNoShinglesWithMultipleInputTokens() throws IOException {
  final int minShingleSize = 7;
  final int maxShingleSize = 7;
  final boolean outputUnigrams = false;
  final boolean outputUnigramsIfNoShingles = true;

  this.shingleFilterTest(minShingleSize, maxShingleSize,
      TEST_TOKEN, TEST_TOKEN,
      UNIGRAM_ONLY_POSITION_INCREMENTS, UNIGRAM_ONLY_TYPES,
      outputUnigrams, outputUnigramsIfNoShingles);
}
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
int[] positionIncrements, String[] types, int[] positionIncrements, String[] types,
boolean outputUnigrams) boolean outputUnigrams)
throws IOException { throws IOException {
ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize); ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
shingleFilterTestCommon filter.setOutputUnigrams(outputUnigrams);
(filter, tokensToCompare, positionIncrements, types, outputUnigrams); shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
} }
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle, protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
@ -1035,8 +1072,20 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
throws IOException { throws IOException {
ShingleFilter filter ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize); = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
shingleFilterTestCommon filter.setOutputUnigrams(outputUnigrams);
(filter, tokensToCompare, positionIncrements, types, outputUnigrams); shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
Token[] tokensToCompare, int[] positionIncrements,
String[] types, boolean outputUnigrams,
boolean outputUnigramsIfNoShingles)
throws IOException {
ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
filter.setOutputUnigrams(outputUnigrams);
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
} }
protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle, protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle,
@ -1046,18 +1095,15 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
ShingleFilter filter ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize); = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
filter.setTokenSeparator(tokenSeparator); filter.setTokenSeparator(tokenSeparator);
shingleFilterTestCommon filter.setOutputUnigrams(outputUnigrams);
(filter, tokensToCompare, positionIncrements, types, outputUnigrams); shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
} }
protected void shingleFilterTestCommon(ShingleFilter filter, protected void shingleFilterTestCommon(ShingleFilter filter,
Token[] tokensToCompare, Token[] tokensToCompare,
int[] positionIncrements, int[] positionIncrements,
String[] types, boolean outputUnigrams) String[] types)
throws IOException { throws IOException {
filter.setOutputUnigrams(outputUnigrams);
String text[] = new String[tokensToCompare.length]; String text[] = new String[tokensToCompare.length];
int startOffsets[] = new int[tokensToCompare.length]; int startOffsets[] = new int[tokensToCompare.length];
int endOffsets[] = new int[tokensToCompare.length]; int endOffsets[] = new int[tokensToCompare.length];