LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles can be generated.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1006187 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2010-10-09 16:55:23 +00:00
parent 2e6787a850
commit f9e4f551e2
5 changed files with 127 additions and 11 deletions

View File

@ -15,6 +15,9 @@ API Changes
RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
behavior. (Steven Rowe, Robert Muir, Uwe Schindler)
* LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
can be generated. (Chris Harris via Steven Rowe)
New Features
* LUCENE-2413: Consolidated Solr analysis components into common.

View File

@ -38,6 +38,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
private boolean outputUnigrams = true;
private boolean outputUnigramsIfNoShingles = false;
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
super();
@ -147,6 +148,24 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
public void setOutputUnigrams(boolean outputUnigrams) {
this.outputUnigrams = outputUnigrams;
}
public boolean isOutputUnigramsIfNoShingles() {
return outputUnigramsIfNoShingles;
}
/**
* <p>Shall we override the behavior of outputUnigrams==false for those
* times when no shingles are available (because there are fewer than
* minShingleSize tokens in the input stream)? (default: false.)
* <p>Note that if outputUnigrams==true, then unigrams are always output,
* regardless of whether any shingles are available.
*
* @param outputUnigramsIfNoShingles Whether or not to output a single
* unigram when no shingles are available.
*/
public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
@ -161,6 +180,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
filter.setMaxShingleSize(maxShingleSize);
filter.setTokenSeparator(tokenSeparator);
filter.setOutputUnigrams(outputUnigrams);
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
return filter;
}
@ -192,6 +212,7 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
streams.shingle.setMinShingleSize(minShingleSize);
streams.shingle.setTokenSeparator(tokenSeparator);
streams.shingle.setOutputUnigrams(outputUnigrams);
streams.shingle.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
return streams.shingle;
}
}

View File

@ -102,6 +102,11 @@ public final class ShingleFilter extends TokenFilter {
*/
private boolean outputUnigrams = true;
/**
* By default, we don't override behavior of outputUnigrams.
*/
private boolean outputUnigramsIfNoShingles = false;
/**
* maximum shingle size (number of tokens)
*/
@ -136,6 +141,11 @@ public final class ShingleFilter extends TokenFilter {
* position.
*/
private boolean isOutputHere = false;
/**
* true if no shingles have been output yet (for outputUnigramsIfNoShingles).
*/
boolean noShingleOutput = true;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@ -211,6 +221,20 @@ public final class ShingleFilter extends TokenFilter {
gramSize = new CircularSequence();
}
/**
* <p>Shall we override the behavior of outputUnigrams==false for those
* times when no shingles are available (because there are fewer than
* minShingleSize tokens in the input stream)? (default: false.)
* <p>Note that if outputUnigrams==true, then unigrams are always output,
* regardless of whether any shingles are available.
*
* @param outputUnigramsIfNoShingles Whether or not to output a single
* unigram when no shingles are available.
*/
public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
}
/**
* Set the max shingle size (default: 2)
*
@ -292,6 +316,7 @@ public final class ShingleFilter extends TokenFilter {
termAtt.setEmpty().append(gramBuilder);
if (gramSize.getValue() > 1) {
typeAtt.setType(tokenType);
noShingleOutput = false;
}
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
isOutputHere = true;
@ -395,6 +420,10 @@ public final class ShingleFilter extends TokenFilter {
}
}
}
if (outputUnigramsIfNoShingles && noShingleOutput
&& gramSize.minValue > 1 && inputWindow.size() < minShingleSize) {
gramSize.minValue = 1;
}
gramSize.reset();
isOutputHere = false;
}
@ -406,6 +435,11 @@ public final class ShingleFilter extends TokenFilter {
inputWindow.clear();
numFillerTokensToInsert = 0;
isOutputHere = false;
noShingleOutput = true;
if (outputUnigramsIfNoShingles && ! outputUnigrams) {
// Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
gramSize.minValue = minShingleSize;
}
}

View File

@ -359,4 +359,16 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 13, 18, 27 },
new int[] { 1, 1, 1 });
}
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
analyzer.setOutputUnigrams(false);
analyzer.setOutputUnigramsIfNoShingles(true);
assertAnalyzesToReuse(analyzer, "please",
new String[] { "please" },
new int[] { 0 },
new int[] { 6 },
new int[] { 1 });
}
}

View File

@ -73,6 +73,14 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
createToken("shingles", 33, 39),
};
public static final int[] UNIGRAM_ONLY_POSITION_INCREMENTS = new int[] {
1, 1, 1, 1, 1, 1
};
public static final String[] UNIGRAM_ONLY_TYPES = new String[] {
"word", "word", "word", "word", "word", "word"
};
public static Token[] testTokenWithHoles;
public static final Token[] BI_GRAM_TOKENS = new Token[] {
@ -1018,15 +1026,44 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
new int[]{1,0,1,0,1,0,1}
);
}
public void testOutputUnigramsIfNoShinglesSingleTokenCase() throws IOException {
// Single token input with outputUnigrams==false is the primary case where
// enabling this option should alter program behavior.
this.shingleFilterTest(2, 2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
false, true);
}
public void testOutputUnigramsIfNoShinglesWithSimpleBigram() throws IOException {
// Here we expect the same result as with testBiGramFilter().
this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS,
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
true, true);
}
public void testOutputUnigramsIfNoShinglesWithSimpleUnigramlessBigram() throws IOException {
// Here we expect the same result as with testBiGramFilterWithoutUnigrams().
this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
false, true);
}
public void testOutputUnigramsIfNoShinglesWithMultipleInputTokens() throws IOException {
// Test when the minimum shingle size is greater than the number of input tokens
this.shingleFilterTest(7, 7, TEST_TOKEN, TEST_TOKEN,
UNIGRAM_ONLY_POSITION_INCREMENTS, UNIGRAM_ONLY_TYPES,
false, true);
}
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
int[] positionIncrements, String[] types,
boolean outputUnigrams)
throws IOException {
ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
shingleFilterTestCommon
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
filter.setOutputUnigrams(outputUnigrams);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
@ -1035,8 +1072,20 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
throws IOException {
ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
shingleFilterTestCommon
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
filter.setOutputUnigrams(outputUnigrams);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
Token[] tokensToCompare, int[] positionIncrements,
String[] types, boolean outputUnigrams,
boolean outputUnigramsIfNoShingles)
throws IOException {
ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
filter.setOutputUnigrams(outputUnigrams);
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle,
@ -1046,18 +1095,15 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
ShingleFilter filter
= new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
filter.setTokenSeparator(tokenSeparator);
shingleFilterTestCommon
(filter, tokensToCompare, positionIncrements, types, outputUnigrams);
filter.setOutputUnigrams(outputUnigrams);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTestCommon(ShingleFilter filter,
Token[] tokensToCompare,
int[] positionIncrements,
String[] types, boolean outputUnigrams)
String[] types)
throws IOException {
filter.setOutputUnigrams(outputUnigrams);
String text[] = new String[tokensToCompare.length];
int startOffsets[] = new int[tokensToCompare.length];
int endOffsets[] = new int[tokensToCompare.length];