mirror of https://github.com/apache/lucene.git
LUCENE-1903: Fix incorrect ShingleFilter behavior when outputUnigrams == false
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@812779 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3507752e42
commit
a8eb5c4b80
|
@ -28,8 +28,10 @@ API Changes
|
||||||
* LUCENE-1460: Change contrib TokenStreams/Filters to use the new
|
* LUCENE-1460: Change contrib TokenStreams/Filters to use the new
|
||||||
TokenStream API. (Robert Muir, Michael Busch)
|
TokenStream API. (Robert Muir, Michael Busch)
|
||||||
|
|
||||||
* LUCENE-1775: Change remaining TokenFilters (shingle, prefix-suffix) to
|
* LUCENE-1775, LUCENE-1903: Change remaining TokenFilters (shingle, prefix-suffix)
|
||||||
use the new TokenStream API. (Robert Muir, Michael Busch)
|
to use the new TokenStream API. ShingleFilter is much more efficient now:
|
||||||
|
it clones much less often and computes the tokens mostly on the fly.
|
||||||
|
Also added more tests. (Robert Muir, Michael Busch)
|
||||||
|
|
||||||
* LUCENE-1685: The position aware SpanScorer has become the default scorer
|
* LUCENE-1685: The position aware SpanScorer has become the default scorer
|
||||||
for Highlighting. The SpanScorer implementation has replaced QueryScorer
|
for Highlighting. The SpanScorer implementation has replaced QueryScorer
|
||||||
|
|
|
@ -88,7 +88,6 @@ public class ShingleFilter extends TokenFilter {
|
||||||
this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||||
this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||||
this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -174,11 +173,15 @@ public class ShingleFilter extends TokenFilter {
|
||||||
|
|
||||||
nextToken = (AttributeSource.State) shingleBuf.getFirst();
|
nextToken = (AttributeSource.State) shingleBuf.getFirst();
|
||||||
|
|
||||||
if (shingleBufferPosition == 0 && (! shingleBuf.isEmpty()) && outputUnigrams) {
|
if (outputUnigrams) {
|
||||||
restoreState(nextToken);
|
if (shingleBufferPosition == 0) {
|
||||||
posIncrAtt.setPositionIncrement(1);
|
restoreState(nextToken);
|
||||||
|
posIncrAtt.setPositionIncrement(1);
|
||||||
|
shingleBufferPosition++;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
shingleBufferPosition++;
|
shingleBufferPosition++;
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (shingleBufferPosition < shingleBuf.size()) {
|
if (shingleBufferPosition < shingleBuf.size()) {
|
||||||
|
@ -277,7 +280,7 @@ public class ShingleFilter extends TokenFilter {
|
||||||
shingleBuf.add(captureState());
|
shingleBuf.add(captureState());
|
||||||
if (shingleBuf.size() > maxShingleSize)
|
if (shingleBuf.size() > maxShingleSize)
|
||||||
{
|
{
|
||||||
shingleBuf.remove(0);
|
shingleBuf.removeFirst();
|
||||||
}
|
}
|
||||||
addedToken = true;
|
addedToken = true;
|
||||||
} else {
|
} else {
|
||||||
|
@ -294,7 +297,7 @@ public class ShingleFilter extends TokenFilter {
|
||||||
* the end of the input stream and have to discard the least recent token.
|
* the end of the input stream and have to discard the least recent token.
|
||||||
*/
|
*/
|
||||||
if (! addedToken) {
|
if (! addedToken) {
|
||||||
shingleBuf.remove(0);
|
shingleBuf.removeFirst();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (shingleBuf.isEmpty()) {
|
if (shingleBuf.isEmpty()) {
|
||||||
|
|
|
@ -115,6 +115,60 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
||||||
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
|
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
|
||||||
};
|
};
|
||||||
|
|
||||||
|
public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
|
||||||
|
createToken("please divide", 0, 13),
|
||||||
|
createToken("divide this", 7, 18),
|
||||||
|
createToken("this sentence", 14, 27),
|
||||||
|
createToken("sentence into", 19, 32),
|
||||||
|
createToken("into shingles", 28, 39),
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
|
||||||
|
1, 1, 1, 1, 1
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
|
||||||
|
"shingle", "shingle", "shingle", "shingle", "shingle"
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final Token[] BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS = new Token[] {
|
||||||
|
createToken("please divide", 0, 13),
|
||||||
|
createToken("divide _", 7, 19),
|
||||||
|
createToken("_ sentence", 19, 27),
|
||||||
|
createToken("sentence _", 19, 33),
|
||||||
|
createToken("_ shingles", 33, 39),
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS = new int[] {
|
||||||
|
1, 1, 1, 1, 1, 1
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
public static final Token[] TEST_SINGLE_TOKEN = new Token[] {
|
||||||
|
createToken("please", 0, 6)
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final Token[] SINGLE_TOKEN = new Token[] {
|
||||||
|
createToken("please", 0, 6)
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final int[] SINGLE_TOKEN_INCREMENTS = new int[] {
|
||||||
|
1
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final String[] SINGLE_TOKEN_TYPES = new String[] {
|
||||||
|
"word"
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final Token[] EMPTY_TOKEN_ARRAY = new Token[] {
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final int[] EMPTY_TOKEN_INCREMENTS_ARRAY = new int[] {
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final String[] EMPTY_TOKEN_TYPES_ARRAY = new String[] {
|
||||||
|
};
|
||||||
|
|
||||||
public static final Token[] TRI_GRAM_TOKENS = new Token[] {
|
public static final Token[] TRI_GRAM_TOKENS = new Token[] {
|
||||||
createToken("please", 0, 6),
|
createToken("please", 0, 6),
|
||||||
createToken("please divide", 0, 13),
|
createToken("please divide", 0, 13),
|
||||||
|
@ -165,19 +219,60 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
||||||
*/
|
*/
|
||||||
public void testBiGramFilter() throws IOException {
|
public void testBiGramFilter() throws IOException {
|
||||||
this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS,
|
this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS,
|
||||||
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
|
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
|
||||||
|
true);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testBiGramFilterWithHoles() throws IOException {
|
public void testBiGramFilterWithHoles() throws IOException {
|
||||||
this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
|
this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
|
||||||
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
|
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
|
||||||
|
true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBiGramFilterWithoutUnigrams() throws IOException {
|
||||||
|
this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
|
||||||
|
BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
|
||||||
|
false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBiGramFilterWithHolesWithoutUnigrams() throws IOException {
|
||||||
|
this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS,
|
||||||
|
BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
|
||||||
|
false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBiGramFilterWithSingleToken() throws IOException {
|
||||||
|
this.shingleFilterTest(2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
|
||||||
|
SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
|
||||||
|
true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBiGramFilterWithSingleTokenWithoutUnigrams() throws IOException {
|
||||||
|
this.shingleFilterTest(2, TEST_SINGLE_TOKEN, EMPTY_TOKEN_ARRAY,
|
||||||
|
EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
|
||||||
|
false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBiGramFilterWithEmptyTokenStream() throws IOException {
|
||||||
|
this.shingleFilterTest(2, EMPTY_TOKEN_ARRAY, EMPTY_TOKEN_ARRAY,
|
||||||
|
EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
|
||||||
|
true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBiGramFilterWithEmptyTokenStreamWithoutUnigrams() throws IOException {
|
||||||
|
this.shingleFilterTest(2, EMPTY_TOKEN_ARRAY, EMPTY_TOKEN_ARRAY,
|
||||||
|
EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
|
||||||
|
false);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testTriGramFilter() throws IOException {
|
public void testTriGramFilter() throws IOException {
|
||||||
this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
|
this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
|
||||||
TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES);
|
TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
|
||||||
|
true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public void testReset() throws Exception {
|
public void testReset() throws Exception {
|
||||||
Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
|
Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
|
||||||
TokenStream filter = new ShingleFilter(wsTokenizer, 2);
|
TokenStream filter = new ShingleFilter(wsTokenizer, 2);
|
||||||
|
@ -197,10 +292,13 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
|
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
|
||||||
int[] positionIncrements, String[] types)
|
int[] positionIncrements, String[] types,
|
||||||
|
boolean outputUnigrams)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
|
ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
|
||||||
|
filter.setOutputUnigrams(outputUnigrams);
|
||||||
|
|
||||||
TermAttribute termAtt = (TermAttribute) filter.addAttribute(TermAttribute.class);
|
TermAttribute termAtt = (TermAttribute) filter.addAttribute(TermAttribute.class);
|
||||||
OffsetAttribute offsetAtt = (OffsetAttribute) filter.addAttribute(OffsetAttribute.class);
|
OffsetAttribute offsetAtt = (OffsetAttribute) filter.addAttribute(OffsetAttribute.class);
|
||||||
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) filter.addAttribute(PositionIncrementAttribute.class);
|
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) filter.addAttribute(PositionIncrementAttribute.class);
|
||||||
|
@ -208,6 +306,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
int i = 0;
|
int i = 0;
|
||||||
while (filter.incrementToken()) {
|
while (filter.incrementToken()) {
|
||||||
|
assertTrue("ShingleFilter outputted more tokens than expected", i < tokensToCompare.length);
|
||||||
String termText = termAtt.term();
|
String termText = termAtt.term();
|
||||||
String goldText = tokensToCompare[i].term();
|
String goldText = tokensToCompare[i].term();
|
||||||
assertEquals("Wrong termText", goldText, termText);
|
assertEquals("Wrong termText", goldText, termText);
|
||||||
|
@ -220,6 +319,8 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
||||||
assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type());
|
assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type());
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
assertEquals("ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" + tokensToCompare.length + ")",
|
||||||
|
tokensToCompare.length, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Token createToken(String term, int start, int offset)
|
private static Token createToken(String term, int start, int offset)
|
||||||
|
|
Loading…
Reference in New Issue