mirror of https://github.com/apache/lucene.git
LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigram was set to false
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@897672 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cfb822be6d
commit
673e368bf7
|
@ -18,6 +18,9 @@ Changes in runtime behavior
|
|||
|
||||
Bug fixes
|
||||
|
||||
* LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigram
|
||||
was set to false. (Simon Willnauer)
|
||||
|
||||
* LUCENE-2068: Fixed ReverseStringFilter which was not aware of supplementary
|
||||
characters. During reverse the filter created unpaired surrogates, which
|
||||
will be replaced by U+FFFD by the indexer, but not at query time. The filter
|
||||
|
|
|
@ -182,7 +182,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
shingleBufferPosition++;
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
} else if (shingleBufferPosition % this.maxShingleSize == 0){
|
||||
shingleBufferPosition++;
|
||||
}
|
||||
|
||||
|
@ -197,7 +197,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
termBuffer = termAtt.resizeTermBuffer(termLength);
|
||||
buf.getChars(0, termLength, termBuffer, 0);
|
||||
termAtt.setTermLength(termLength);
|
||||
if ((! outputUnigrams) && shingleBufferPosition == 1) {
|
||||
if ((! outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1) {
|
||||
posIncrAtt.setPositionIncrement(1);
|
||||
} else {
|
||||
posIncrAtt.setPositionIncrement(0);
|
||||
|
|
|
@ -200,6 +200,93 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
"word", "shingle",
|
||||
"word"
|
||||
};
|
||||
|
||||
public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
|
||||
createToken("please divide", 0, 13),
|
||||
createToken("please divide this", 0, 18),
|
||||
createToken("divide this", 7, 18),
|
||||
createToken("divide this sentence", 7, 27),
|
||||
createToken("this sentence", 14, 27),
|
||||
createToken("this sentence into", 14, 32),
|
||||
createToken("sentence into", 19, 32),
|
||||
createToken("sentence into shingles", 19, 39),
|
||||
createToken("into shingles", 28, 39),
|
||||
};
|
||||
|
||||
public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
|
||||
1, 0, 1, 0, 1, 0, 1, 0, 1
|
||||
};
|
||||
|
||||
public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle",
|
||||
};
|
||||
|
||||
public static final Token[] FOUR_GRAM_TOKENS = new Token[] {
|
||||
createToken("please", 0, 6),
|
||||
createToken("please divide", 0, 13),
|
||||
createToken("please divide this", 0, 18),
|
||||
createToken("please divide this sentence", 0, 27),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("divide this", 7, 18),
|
||||
createToken("divide this sentence", 7, 27),
|
||||
createToken("divide this sentence into", 7, 32),
|
||||
createToken("this", 14, 18),
|
||||
createToken("this sentence", 14, 27),
|
||||
createToken("this sentence into", 14, 32),
|
||||
createToken("this sentence into shingles", 14, 39),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("sentence into", 19, 32),
|
||||
createToken("sentence into shingles", 19, 39),
|
||||
createToken("into", 28, 32),
|
||||
createToken("into shingles", 28, 39),
|
||||
createToken("shingles", 33, 39)
|
||||
};
|
||||
|
||||
public static final int[] FOUR_GRAM_POSITION_INCREMENTS = new int[] {
|
||||
1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1
|
||||
};
|
||||
|
||||
public static final String[] FOUR_GRAM_TYPES = new String[] {
|
||||
"word", "shingle", "shingle", "shingle",
|
||||
"word", "shingle", "shingle", "shingle",
|
||||
"word", "shingle", "shingle", "shingle",
|
||||
"word", "shingle", "shingle",
|
||||
"word", "shingle",
|
||||
"word"
|
||||
};
|
||||
|
||||
public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
|
||||
createToken("please divide", 0, 13),
|
||||
createToken("please divide this", 0, 18),
|
||||
createToken("please divide this sentence", 0, 27),
|
||||
createToken("divide this", 7, 18),
|
||||
createToken("divide this sentence", 7, 27),
|
||||
createToken("divide this sentence into", 7, 32),
|
||||
createToken("this sentence", 14, 27),
|
||||
createToken("this sentence into", 14, 32),
|
||||
createToken("this sentence into shingles", 14, 39),
|
||||
createToken("sentence into", 19, 32),
|
||||
createToken("sentence into shingles", 19, 39),
|
||||
createToken("into shingles", 28, 39),
|
||||
};
|
||||
|
||||
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
|
||||
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
|
||||
};
|
||||
|
||||
public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
"shingle", "shingle",
|
||||
|
||||
};
|
||||
|
||||
|
||||
@Override
|
||||
|
@ -272,8 +359,25 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
|
||||
true);
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void testTriGramFilterWithoutUnigrams() throws IOException {
|
||||
this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
|
||||
TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, TRI_GRAM_TYPES_WITHOUT_UNIGRAMS,
|
||||
false);
|
||||
}
|
||||
|
||||
public void testFourGramFilter() throws IOException {
|
||||
this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS,
|
||||
FOUR_GRAM_POSITION_INCREMENTS, FOUR_GRAM_TYPES,
|
||||
true);
|
||||
}
|
||||
|
||||
public void testFourGramFilterWithoutUnigrams() throws IOException {
|
||||
this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS,
|
||||
FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
|
||||
FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS, false);
|
||||
}
|
||||
|
||||
|
||||
public void testReset() throws Exception {
|
||||
Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
|
||||
|
|
Loading…
Reference in New Issue