LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigrams was set to false

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@897672 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2010-01-10 18:06:19 +00:00
parent cfb822be6d
commit 673e368bf7
3 changed files with 111 additions and 4 deletions

View File

@ -18,6 +18,9 @@ Changes in runtime behavior
Bug fixes
* LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigrams
was set to false. (Simon Willnauer)
* LUCENE-2068: Fixed ReverseStringFilter which was not aware of supplementary
characters. During reverse the filter created unpaired surrogates, which
will be replaced by U+FFFD by the indexer, but not at query time. The filter

View File

@ -182,7 +182,7 @@ public final class ShingleFilter extends TokenFilter {
shingleBufferPosition++;
return true;
}
} else {
} else if (shingleBufferPosition % this.maxShingleSize == 0){
shingleBufferPosition++;
}
@ -197,7 +197,7 @@ public final class ShingleFilter extends TokenFilter {
termBuffer = termAtt.resizeTermBuffer(termLength);
buf.getChars(0, termLength, termBuffer, 0);
termAtt.setTermLength(termLength);
if ((! outputUnigrams) && shingleBufferPosition == 1) {
if ((! outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1) {
posIncrAtt.setPositionIncrement(1);
} else {
posIncrAtt.setPositionIncrement(0);

View File

@ -200,6 +200,93 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
"word", "shingle",
"word"
};
/**
 * Expected tokens for {@link #testTriGramFilterWithoutUnigrams()}: the two- and
 * three-word shingles of "please divide this sentence into shingles", listed
 * word by word in the order the shingles start. No single-word tokens appear.
 * NOTE(review): the two integers handed to createToken look like start/end
 * character offsets; createToken is declared outside this excerpt - confirm.
 */
public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS = {
  // shingles starting at "please"
  createToken("please divide", 0, 13),
  createToken("please divide this", 0, 18),

  // shingles starting at "divide"
  createToken("divide this", 7, 18),
  createToken("divide this sentence", 7, 27),

  // shingles starting at "this"
  createToken("this sentence", 14, 27),
  createToken("this sentence into", 14, 32),

  // shingles starting at "sentence"
  createToken("sentence into", 19, 32),
  createToken("sentence into shingles", 19, 39),

  // only one shingle can start at "into"
  createToken("into shingles", 28, 39)
};
/**
 * Expected position increments, index for index with
 * TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS: 1 for the first shingle listed for a start
 * word, 0 for the longer shingle that shares that start word.
 */
public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = {
  1, 0,  // "please ..."
  1, 0,  // "divide ..."
  1, 0,  // "this ..."
  1, 0,  // "sentence ..."
  1      // "into shingles"
};
/**
 * Expected token types, index for index with TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS.
 * All nine entries are "shingle" because that token list holds no
 * single-word tokens.
 */
public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS = {
  "shingle", "shingle", "shingle",
  "shingle", "shingle", "shingle",
  "shingle", "shingle", "shingle"
};
/**
 * Expected tokens for {@link #testFourGramFilter()}: for each word of
 * "please divide this sentence into shingles", the word itself followed by the
 * shingles of up to four words that start at it.
 * NOTE(review): the two integers handed to createToken look like start/end
 * character offsets; createToken is declared outside this excerpt - confirm.
 */
public static final Token[] FOUR_GRAM_TOKENS = {
  // starting at "please"
  createToken("please", 0, 6),
  createToken("please divide", 0, 13),
  createToken("please divide this", 0, 18),
  createToken("please divide this sentence", 0, 27),

  // starting at "divide"
  createToken("divide", 7, 13),
  createToken("divide this", 7, 18),
  createToken("divide this sentence", 7, 27),
  createToken("divide this sentence into", 7, 32),

  // starting at "this"
  createToken("this", 14, 18),
  createToken("this sentence", 14, 27),
  createToken("this sentence into", 14, 32),
  createToken("this sentence into shingles", 14, 39),

  // starting at "sentence" (only three words remain)
  createToken("sentence", 19, 27),
  createToken("sentence into", 19, 32),
  createToken("sentence into shingles", 19, 39),

  // starting at "into"
  createToken("into", 28, 32),
  createToken("into shingles", 28, 39),

  // starting at "shingles"
  createToken("shingles", 33, 39)
};
/**
 * Expected position increments, index for index with FOUR_GRAM_TOKENS: 1 for
 * each single-word token, 0 for every shingle listed after it for the same
 * start word.
 */
public static final int[] FOUR_GRAM_POSITION_INCREMENTS = {
  1, 0, 0, 0,  // "please ..."
  1, 0, 0, 0,  // "divide ..."
  1, 0, 0, 0,  // "this ..."
  1, 0, 0,     // "sentence ..."
  1, 0,        // "into ..."
  1            // "shingles"
};
/**
 * Expected token types, index for index with FOUR_GRAM_TOKENS: "word" for each
 * single-word token and "shingle" for each multi-word token that follows it.
 */
public static final String[] FOUR_GRAM_TYPES = {
  // "please ..."
  "word", "shingle", "shingle", "shingle",
  // "divide ..."
  "word", "shingle", "shingle", "shingle",
  // "this ..."
  "word", "shingle", "shingle", "shingle",
  // "sentence ..."
  "word", "shingle", "shingle",
  // "into ..."
  "word", "shingle",
  // "shingles"
  "word"
};
/**
 * Expected tokens for {@link #testFourGramFilterWithoutUnigrams()}: the two-,
 * three- and four-word shingles of "please divide this sentence into
 * shingles", listed word by word in the order the shingles start. No
 * single-word tokens appear.
 * NOTE(review): the two integers handed to createToken look like start/end
 * character offsets; createToken is declared outside this excerpt - confirm.
 */
public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS = {
  // shingles starting at "please"
  createToken("please divide", 0, 13),
  createToken("please divide this", 0, 18),
  createToken("please divide this sentence", 0, 27),

  // shingles starting at "divide"
  createToken("divide this", 7, 18),
  createToken("divide this sentence", 7, 27),
  createToken("divide this sentence into", 7, 32),

  // shingles starting at "this"
  createToken("this sentence", 14, 27),
  createToken("this sentence into", 14, 32),
  createToken("this sentence into shingles", 14, 39),

  // shingles starting at "sentence"
  createToken("sentence into", 19, 32),
  createToken("sentence into shingles", 19, 39),

  // only one shingle can start at "into"
  createToken("into shingles", 28, 39)
};
/**
 * Expected position increments, index for index with
 * FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS: 1 for the first shingle listed for a start
 * word, 0 for the longer shingles that share that start word.
 */
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = {
  1, 0, 0,  // "please ..."
  1, 0, 0,  // "divide ..."
  1, 0, 0,  // "this ..."
  1, 0,     // "sentence ..."
  1         // "into shingles"
};
/**
 * Expected token types, index for index with
 * FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS. All twelve entries are "shingle" because
 * that token list holds no single-word tokens. Rows follow the token grouping
 * (three, three, three, two, one).
 */
public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS = {
  "shingle", "shingle", "shingle",
  "shingle", "shingle", "shingle",
  "shingle", "shingle", "shingle",
  "shingle", "shingle",
  "shingle"
};
@Override
@ -272,8 +359,25 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
true);
}
/**
 * Checks the filter against the TRI_GRAM_*_WITHOUT_UNIGRAMS expectations.
 * NOTE(review): shingleFilterTest is declared outside this excerpt; the leading
 * 3 is presumably the maximum shingle size and the trailing false presumably
 * turns unigram output off - confirm against its signature.
 */
public void testTriGramFilterWithoutUnigrams() throws IOException {
  shingleFilterTest(
      3,
      TEST_TOKEN,
      TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
      TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
      TRI_GRAM_TYPES_WITHOUT_UNIGRAMS,
      false);
}
/**
 * Checks the filter against the FOUR_GRAM_* expectations, which include the
 * single-word tokens.
 * NOTE(review): shingleFilterTest is declared outside this excerpt; the leading
 * 4 is presumably the maximum shingle size and the trailing true presumably
 * keeps unigram output on - confirm against its signature.
 */
public void testFourGramFilter() throws IOException {
  shingleFilterTest(
      4,
      TEST_TOKEN,
      FOUR_GRAM_TOKENS,
      FOUR_GRAM_POSITION_INCREMENTS,
      FOUR_GRAM_TYPES,
      true);
}
/**
 * Checks the filter against the FOUR_GRAM_*_WITHOUT_UNIGRAMS expectations.
 * NOTE(review): shingleFilterTest is declared outside this excerpt; the leading
 * 4 is presumably the maximum shingle size and the trailing false presumably
 * turns unigram output off - confirm against its signature.
 */
public void testFourGramFilterWithoutUnigrams() throws IOException {
  shingleFilterTest(
      4,
      TEST_TOKEN,
      FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS,
      FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
      FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS,
      false);
}
public void testReset() throws Exception {
Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));