mirror of https://github.com/apache/lucene.git

LUCENE-7708: Fix position length attribute set by the ShingleFilter when outputUnigrams=false

parent cab3aae11d
commit 57a42e4ec5
@@ -155,6 +155,10 @@ Bug Fixes
   token graph, messing up phrase queries when it was used during query
   parsing (Ere Maijala via Mike McCandless)
 
+* LUCENE-7708: ShingleFilter without unigram was producing a disconnected
+  token graph, messing up queries when it was used during query
+  parsing (Jim Ferenczi)
+
 Improvements
 
 * LUCENE-7055: Added Weight#scorerSupplier, which allows to estimate the cost
@@ -343,7 +343,12 @@ public final class ShingleFilter extends TokenFilter {
           noShingleOutput = false;
         }
         offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
-        posLenAtt.setPositionLength(builtGramSize);
+        if (outputUnigrams) {
+          posLenAtt.setPositionLength(builtGramSize);
+        } else {
+          // position length for this token is the number of positions created by shingles of smaller size.
+          posLenAtt.setPositionLength(Math.max(1, (builtGramSize - minShingleSize) + 1));
+        }
         isOutputHere = true;
         gramSize.advance();
         tokenAvailable = true;
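For context: with outputUnigrams=false the smallest shingle, not the single token, is what occupies one position in the token graph, so a built n-gram can span at most one position per shingle size below it. A minimal standalone sketch of the new rule follows; the class and method names are illustrative, not taken from the patch:

public class ShinglePositionLengthDemo {

  // Mirrors the patched expression: Math.max(1, (builtGramSize - minShingleSize) + 1).
  static int positionLength(int builtGramSize, int minShingleSize) {
    return Math.max(1, (builtGramSize - minShingleSize) + 1);
  }

  public static void main(String[] args) {
    // With minShingleSize=2 and maxShingleSize=4 (the second analyzer in the new test),
    // each position emits a 2-gram, 3-gram and 4-gram whose position lengths are
    // 1, 2 and 3, matching the asserted {1, 2, 3, ...} pattern below.
    for (int gram = 2; gram <= 4; gram++) {
      System.out.println(gram + "-gram -> positionLength=" + positionLength(gram, 2));
    }
  }
}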
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 public class ShingleFilterTest extends BaseTokenStreamTestCase {
 
@@ -1239,7 +1239,6 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
     filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
     filter.setFillerToken(null);
     filter.setTokenSeparator(null);
-
     assertTokenStreamContents(filter,
                               new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"},
                               new int[] {0, 0, 0, 7, 7, 7},
@@ -1247,4 +1246,95 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
                               new int[] {1, 0, 0, 1, 0, 0},
                               20);
   }
+
+  public void testPositionLength() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4);
+        filter.setOutputUnigrams(false);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+    assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+        new String[] {"to be or not", "be or not to", "or not to be"},
+        new int[] {0, 3, 6},
+        new int[] {12, 15, 18},
+        null,
+        new int[] {1, 1, 1},
+        new int[] {1, 1, 1},
+        18,
+        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+        // finishing at the same position
+        false);
+
+
+    a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4);
+        filter.setOutputUnigrams(false);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+    assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+        new String[] {"to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to",
+            "or not to be", "not to", "not to be", "to be"},
+        new int[] {0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13},
+        new int[] {5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18},
+        null,
+        new int[] {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1},
+        new int[] {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1},
+        18,
+        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+        // finishing at the same position
+        false);
+
+    a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4);
+        filter.setOutputUnigrams(false);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+
+    assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+        new String[] {"to be or", "to be or not", "be or not", "be or not to", "or not to",
+            "or not to be", "not to be"},
+        new int[] {0, 0, 3, 3, 6, 6, 9},
+        new int[] {8, 12, 12, 15, 15, 18, 18},
+        null,
+        new int[] {1, 0, 1, 0, 1, 0, 1},
+        new int[] {1, 2, 1, 2, 1, 2, 1},
+        18,
+        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+        // finishing at the same position
+        false);
+
+    a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5);
+        filter.setOutputUnigrams(false);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+    assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+        new String[] {"to be or", "to be or not", "to be or not to", "be or not", "be or not to",
+            "be or not to be", "or not to", "or not to be", "not to be"},
+        new int[] {0, 0, 0, 3, 3, 3, 6, 6, 9},
+        new int[] {8, 12, 15, 12, 15, 18, 15, 18, 18},
+        null,
+        new int[] {1, 0, 0, 1, 0, 0, 1, 0, 1},
+        new int[] {1, 2, 3, 1, 2, 3, 1, 2, 1},
+        18,
+        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+        // finishing at the same position
+        false);
+  }
 }
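To see the fixed attributes end to end, one can dump them from a live ShingleFilter. The following self-contained sketch (not part of the commit; it assumes the lucene-analyzers-common module is on the classpath, and the class name and output format are illustrative) prints each shingle with its position increment and position length for the minShingleSize=2, maxShingleSize=4 case asserted above:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

public class ShingleGraphDump {
  public static void main(String[] args) throws IOException {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("to be or not to be"));

    ShingleFilter shingles = new ShingleFilter(tokenizer, 2, 4);
    shingles.setOutputUnigrams(false); // the configuration this commit fixes

    CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = shingles.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLen = shingles.addAttribute(PositionLengthAttribute.class);

    shingles.reset();
    while (shingles.incrementToken()) {
      // With the fix, the 2-/3-/4-gram at each position reports posLen 1/2/3.
      System.out.println(term + " posIncr=" + posIncr.getPositionIncrement()
          + " posLen=" + posLen.getPositionLength());
    }
    shingles.end();
    shingles.close();
  }
}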