LUCENE-1903: Fix incorrect ShingleFilter behavior when outputUnigrams == false

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@812779 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2009-09-09 06:02:54 +00:00
parent 3507752e42
commit a8eb5c4b80
3 changed files with 120 additions and 14 deletions

View File

@ -28,8 +28,10 @@ API Changes
* LUCENE-1460: Change contrib TokenStreams/Filters to use the new * LUCENE-1460: Change contrib TokenStreams/Filters to use the new
TokenStream API. (Robert Muir, Michael Busch) TokenStream API. (Robert Muir, Michael Busch)
* LUCENE-1775: Change remaining TokenFilters (shingle, prefix-suffix) to * LUCENE-1775, LUCENE-1903: Change remaining TokenFilters (shingle, prefix-suffix)
use the new TokenStream API. (Robert Muir, Michael Busch) to use the new TokenStream API. ShingleFilter is much more efficient now,
it clones much less often and computes the tokens mostly on the fly now.
Also added more tests. (Robert Muir, Michael Busch)
* LUCENE-1685: The position aware SpanScorer has become the default scorer * LUCENE-1685: The position aware SpanScorer has become the default scorer
for Highlighting. The SpanScorer implementation has replaced QueryScorer for Highlighting. The SpanScorer implementation has replaced QueryScorer

View File

@ -88,7 +88,6 @@ public class ShingleFilter extends TokenFilter {
this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
} }
/** /**
@ -174,11 +173,15 @@ public class ShingleFilter extends TokenFilter {
nextToken = (AttributeSource.State) shingleBuf.getFirst(); nextToken = (AttributeSource.State) shingleBuf.getFirst();
if (shingleBufferPosition == 0 && (! shingleBuf.isEmpty()) && outputUnigrams) { if (outputUnigrams) {
restoreState(nextToken); if (shingleBufferPosition == 0) {
posIncrAtt.setPositionIncrement(1); restoreState(nextToken);
posIncrAtt.setPositionIncrement(1);
shingleBufferPosition++;
return true;
}
} else {
shingleBufferPosition++; shingleBufferPosition++;
return true;
} }
if (shingleBufferPosition < shingleBuf.size()) { if (shingleBufferPosition < shingleBuf.size()) {
@ -277,7 +280,7 @@ public class ShingleFilter extends TokenFilter {
shingleBuf.add(captureState()); shingleBuf.add(captureState());
if (shingleBuf.size() > maxShingleSize) if (shingleBuf.size() > maxShingleSize)
{ {
shingleBuf.remove(0); shingleBuf.removeFirst();
} }
addedToken = true; addedToken = true;
} else { } else {
@ -294,7 +297,7 @@ public class ShingleFilter extends TokenFilter {
* the end of the input stream and have to discard the least recent token. * the end of the input stream and have to discard the least recent token.
*/ */
if (! addedToken) { if (! addedToken) {
shingleBuf.remove(0); shingleBuf.removeFirst();
} }
if (shingleBuf.isEmpty()) { if (shingleBuf.isEmpty()) {

View File

@ -115,6 +115,60 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
}; };
/**
 * Expected output tokens for {@code testBiGramFilterWithoutUnigrams}: only the
 * two-word shingles, with no single-word tokens in between. The two numeric
 * arguments are passed to {@code createToken} as its {@code start} and
 * {@code offset} parameters.
 */
public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
createToken("please divide", 0, 13),
createToken("divide this", 7, 18),
createToken("this sentence", 14, 27),
createToken("sentence into", 19, 32),
createToken("into shingles", 28, 39),
};
/**
 * Expected position increments, one per entry of
 * {@link #BI_GRAM_TOKENS_WITHOUT_UNIGRAMS}; every shingle is expected to
 * advance the position by one.
 */
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
1, 1, 1, 1, 1
};
/**
 * Expected token types for the unigram-free bigram tests; every emitted token
 * is expected to carry the type {@code "shingle"}.
 */
public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
"shingle", "shingle", "shingle", "shingle", "shingle"
};
/**
 * Expected output tokens for {@code testBiGramFilterWithHolesWithoutUnigrams}:
 * the two-word shingles produced from {@code testTokenWithHoles} when unigram
 * output is disabled.
 * NOTE(review): the {@code "_"} terms presumably stand in for positions the
 * input stream skips; the input fixture is not visible here, so confirm.
 */
public static final Token[] BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS = new Token[] {
createToken("please divide", 0, 13),
createToken("divide _", 7, 19),
createToken("_ sentence", 19, 27),
createToken("sentence _", 19, 33),
createToken("_ shingles", 33, 39),
};
/**
 * Expected position increments for
 * {@code testBiGramFilterWithHolesWithoutUnigrams}: one entry per expected
 * shingle. The array is kept at five entries so that it lines up with the
 * five expected tokens and five expected types used by that test; a surplus
 * entry would never be compared and would only obscure the expected output.
 */
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS = new int[] {
  1, 1, 1, 1, 1
};
/**
 * Input fixture holding exactly one token; fed to the filter by the
 * single-token bigram tests.
 */
public static final Token[] TEST_SINGLE_TOKEN = new Token[] {
createToken("please", 0, 6)
};
/**
 * Expected output for {@code testBiGramFilterWithSingleToken}: with unigram
 * output enabled, the lone input token is expected to pass through unchanged.
 */
public static final Token[] SINGLE_TOKEN = new Token[] {
createToken("please", 0, 6)
};
/** Expected position increment for the single token in {@link #SINGLE_TOKEN}. */
public static final int[] SINGLE_TOKEN_INCREMENTS = new int[] {
1
};
/** Expected type for the single token in {@link #SINGLE_TOKEN}. */
public static final String[] SINGLE_TOKEN_TYPES = new String[] {
"word"
};
/**
 * Zero-length token array. The tests use it both as an empty input stream and
 * as the expected output when the filter should emit nothing.
 */
public static final Token[] EMPTY_TOKEN_ARRAY = new Token[] {
};
/** Zero-length position-increment array paired with {@link #EMPTY_TOKEN_ARRAY}. */
public static final int[] EMPTY_TOKEN_INCREMENTS_ARRAY = new int[] {
};
/** Zero-length type array paired with {@link #EMPTY_TOKEN_ARRAY}. */
public static final String[] EMPTY_TOKEN_TYPES_ARRAY = new String[] {
};
public static final Token[] TRI_GRAM_TOKENS = new Token[] { public static final Token[] TRI_GRAM_TOKENS = new Token[] {
createToken("please", 0, 6), createToken("please", 0, 6),
createToken("please divide", 0, 13), createToken("please divide", 0, 13),
@ -165,19 +219,60 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
*/ */
public void testBiGramFilter() throws IOException { public void testBiGramFilter() throws IOException {
this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS, this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS,
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES); BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
true);
} }
public void testBiGramFilterWithHoles() throws IOException { public void testBiGramFilterWithHoles() throws IOException {
this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES, this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES); BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
true);
}
/**
 * Shingles TEST_TOKEN with a maximum shingle size of two and unigram output
 * switched off, and checks the result against the unigram-free expectations.
 */
public void testBiGramFilterWithoutUnigrams() throws IOException {
  final int maxShingleSize = 2;
  final boolean emitUnigrams = false;
  shingleFilterTest(
      maxShingleSize,
      TEST_TOKEN,
      BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
      BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
      BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
      emitUnigrams);
}
/**
 * Shingles testTokenWithHoles with a maximum shingle size of two and unigram
 * output switched off, and checks the result against the with-holes,
 * unigram-free expectations.
 */
public void testBiGramFilterWithHolesWithoutUnigrams() throws IOException {
  final int maxShingleSize = 2;
  final boolean emitUnigrams = false;
  shingleFilterTest(
      maxShingleSize,
      testTokenWithHoles,
      BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS,
      BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS,
      BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
      emitUnigrams);
}
/**
 * Feeds a one-token stream through the filter with a maximum shingle size of
 * two and unigram output enabled; the expectation is the single input token.
 */
public void testBiGramFilterWithSingleToken() throws IOException {
  final int maxShingleSize = 2;
  final boolean emitUnigrams = true;
  shingleFilterTest(
      maxShingleSize,
      TEST_SINGLE_TOKEN,
      SINGLE_TOKEN,
      SINGLE_TOKEN_INCREMENTS,
      SINGLE_TOKEN_TYPES,
      emitUnigrams);
}
/**
 * Feeds a one-token stream through the filter with a maximum shingle size of
 * two and unigram output disabled; the expectation is that nothing is emitted.
 */
public void testBiGramFilterWithSingleTokenWithoutUnigrams() throws IOException {
  final int maxShingleSize = 2;
  final boolean emitUnigrams = false;
  shingleFilterTest(
      maxShingleSize,
      TEST_SINGLE_TOKEN,
      EMPTY_TOKEN_ARRAY,
      EMPTY_TOKEN_INCREMENTS_ARRAY,
      EMPTY_TOKEN_TYPES_ARRAY,
      emitUnigrams);
}
/**
 * Feeds an empty stream through the filter with a maximum shingle size of two
 * and unigram output enabled; the expectation is that nothing is emitted.
 */
public void testBiGramFilterWithEmptyTokenStream() throws IOException {
  final int maxShingleSize = 2;
  final boolean emitUnigrams = true;
  shingleFilterTest(
      maxShingleSize,
      EMPTY_TOKEN_ARRAY,
      EMPTY_TOKEN_ARRAY,
      EMPTY_TOKEN_INCREMENTS_ARRAY,
      EMPTY_TOKEN_TYPES_ARRAY,
      emitUnigrams);
}
public void testBiGramFilterWithEmptyTokenStreamWithoutUnigrams() throws IOException {
this.shingleFilterTest(2, EMPTY_TOKEN_ARRAY, EMPTY_TOKEN_ARRAY,
EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
false);
} }
public void testTriGramFilter() throws IOException { public void testTriGramFilter() throws IOException {
this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS, this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES); TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
true);
} }
public void testReset() throws Exception { public void testReset() throws Exception {
Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence")); Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
TokenStream filter = new ShingleFilter(wsTokenizer, 2); TokenStream filter = new ShingleFilter(wsTokenizer, 2);
@ -197,10 +292,13 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
} }
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
int[] positionIncrements, String[] types) int[] positionIncrements, String[] types,
boolean outputUnigrams)
throws IOException { throws IOException {
TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize); ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
filter.setOutputUnigrams(outputUnigrams);
TermAttribute termAtt = (TermAttribute) filter.addAttribute(TermAttribute.class); TermAttribute termAtt = (TermAttribute) filter.addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) filter.addAttribute(OffsetAttribute.class); OffsetAttribute offsetAtt = (OffsetAttribute) filter.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) filter.addAttribute(PositionIncrementAttribute.class); PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) filter.addAttribute(PositionIncrementAttribute.class);
@ -208,6 +306,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
int i = 0; int i = 0;
while (filter.incrementToken()) { while (filter.incrementToken()) {
assertTrue("ShingleFilter outputted more tokens than expected", i < tokensToCompare.length);
String termText = termAtt.term(); String termText = termAtt.term();
String goldText = tokensToCompare[i].term(); String goldText = tokensToCompare[i].term();
assertEquals("Wrong termText", goldText, termText); assertEquals("Wrong termText", goldText, termText);
@ -220,6 +319,8 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type()); assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type());
i++; i++;
} }
assertEquals("ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" + tokensToCompare.length + ")",
tokensToCompare.length, i);
} }
private static Token createToken(String term, int start, int offset) private static Token createToken(String term, int start, int offset)