mirror of
https://github.com/apache/lucene.git
synced 2025-02-27 21:09:19 +00:00
LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigrams was set to false
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@897672 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cfb822be6d
commit
673e368bf7
@ -18,6 +18,9 @@ Changes in runtime behavior
|
|||||||
|
|
||||||
Bug fixes
|
Bug fixes
|
||||||
|
|
||||||
|
* LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigrams
|
||||||
|
was set to false. (Simon Willnauer)
|
||||||
|
|
||||||
* LUCENE-2068: Fixed ReverseStringFilter which was not aware of supplementary
|
* LUCENE-2068: Fixed ReverseStringFilter which was not aware of supplementary
|
||||||
characters. During reverse the filter created unpaired surrogates, which
|
characters. During reverse the filter created unpaired surrogates, which
|
||||||
will be replaced by U+FFFD by the indexer, but not at query time. The filter
|
will be replaced by U+FFFD by the indexer, but not at query time. The filter
|
||||||
|
@ -182,7 +182,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||||||
shingleBufferPosition++;
|
shingleBufferPosition++;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} else {
|
} else if (shingleBufferPosition % this.maxShingleSize == 0){
|
||||||
shingleBufferPosition++;
|
shingleBufferPosition++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -197,7 +197,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||||||
termBuffer = termAtt.resizeTermBuffer(termLength);
|
termBuffer = termAtt.resizeTermBuffer(termLength);
|
||||||
buf.getChars(0, termLength, termBuffer, 0);
|
buf.getChars(0, termLength, termBuffer, 0);
|
||||||
termAtt.setTermLength(termLength);
|
termAtt.setTermLength(termLength);
|
||||||
if ((! outputUnigrams) && shingleBufferPosition == 1) {
|
if ((! outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1) {
|
||||||
posIncrAtt.setPositionIncrement(1);
|
posIncrAtt.setPositionIncrement(1);
|
||||||
} else {
|
} else {
|
||||||
posIncrAtt.setPositionIncrement(0);
|
posIncrAtt.setPositionIncrement(0);
|
||||||
|
@ -200,6 +200,93 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||||||
"word", "shingle",
|
"word", "shingle",
|
||||||
"word"
|
"word"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
|
||||||
|
createToken("please divide", 0, 13),
|
||||||
|
createToken("please divide this", 0, 18),
|
||||||
|
createToken("divide this", 7, 18),
|
||||||
|
createToken("divide this sentence", 7, 27),
|
||||||
|
createToken("this sentence", 14, 27),
|
||||||
|
createToken("this sentence into", 14, 32),
|
||||||
|
createToken("sentence into", 19, 32),
|
||||||
|
createToken("sentence into shingles", 19, 39),
|
||||||
|
createToken("into shingles", 28, 39),
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
|
||||||
|
1, 0, 1, 0, 1, 0, 1, 0, 1
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
|
||||||
|
"shingle", "shingle",
|
||||||
|
"shingle", "shingle",
|
||||||
|
"shingle", "shingle",
|
||||||
|
"shingle", "shingle",
|
||||||
|
"shingle",
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final Token[] FOUR_GRAM_TOKENS = new Token[] {
|
||||||
|
createToken("please", 0, 6),
|
||||||
|
createToken("please divide", 0, 13),
|
||||||
|
createToken("please divide this", 0, 18),
|
||||||
|
createToken("please divide this sentence", 0, 27),
|
||||||
|
createToken("divide", 7, 13),
|
||||||
|
createToken("divide this", 7, 18),
|
||||||
|
createToken("divide this sentence", 7, 27),
|
||||||
|
createToken("divide this sentence into", 7, 32),
|
||||||
|
createToken("this", 14, 18),
|
||||||
|
createToken("this sentence", 14, 27),
|
||||||
|
createToken("this sentence into", 14, 32),
|
||||||
|
createToken("this sentence into shingles", 14, 39),
|
||||||
|
createToken("sentence", 19, 27),
|
||||||
|
createToken("sentence into", 19, 32),
|
||||||
|
createToken("sentence into shingles", 19, 39),
|
||||||
|
createToken("into", 28, 32),
|
||||||
|
createToken("into shingles", 28, 39),
|
||||||
|
createToken("shingles", 33, 39)
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final int[] FOUR_GRAM_POSITION_INCREMENTS = new int[] {
|
||||||
|
1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final String[] FOUR_GRAM_TYPES = new String[] {
|
||||||
|
"word", "shingle", "shingle", "shingle",
|
||||||
|
"word", "shingle", "shingle", "shingle",
|
||||||
|
"word", "shingle", "shingle", "shingle",
|
||||||
|
"word", "shingle", "shingle",
|
||||||
|
"word", "shingle",
|
||||||
|
"word"
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
|
||||||
|
createToken("please divide", 0, 13),
|
||||||
|
createToken("please divide this", 0, 18),
|
||||||
|
createToken("please divide this sentence", 0, 27),
|
||||||
|
createToken("divide this", 7, 18),
|
||||||
|
createToken("divide this sentence", 7, 27),
|
||||||
|
createToken("divide this sentence into", 7, 32),
|
||||||
|
createToken("this sentence", 14, 27),
|
||||||
|
createToken("this sentence into", 14, 32),
|
||||||
|
createToken("this sentence into shingles", 14, 39),
|
||||||
|
createToken("sentence into", 19, 32),
|
||||||
|
createToken("sentence into shingles", 19, 39),
|
||||||
|
createToken("into shingles", 28, 39),
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
|
||||||
|
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
|
||||||
|
"shingle", "shingle",
|
||||||
|
"shingle", "shingle",
|
||||||
|
"shingle", "shingle",
|
||||||
|
"shingle", "shingle",
|
||||||
|
"shingle", "shingle",
|
||||||
|
"shingle", "shingle",
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -272,8 +359,25 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
|
|||||||
TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
|
TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
|
||||||
true);
|
true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testTriGramFilterWithoutUnigrams() throws IOException {
|
||||||
|
this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
|
||||||
|
TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, TRI_GRAM_TYPES_WITHOUT_UNIGRAMS,
|
||||||
|
false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFourGramFilter() throws IOException {
|
||||||
|
this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS,
|
||||||
|
FOUR_GRAM_POSITION_INCREMENTS, FOUR_GRAM_TYPES,
|
||||||
|
true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFourGramFilterWithoutUnigrams() throws IOException {
|
||||||
|
this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS,
|
||||||
|
FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
|
||||||
|
FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public void testReset() throws Exception {
|
public void testReset() throws Exception {
|
||||||
Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
|
Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
|
||||||
|
Loading…
x
Reference in New Issue
Block a user