LUCENE-8719: Traverse all paths at the end of a TokenStream in FixedShingleFilter

This commit is contained in:
Alan Woodward 2019-03-14 11:10:02 +00:00
parent 003edafdc8
commit 6571d06c25
3 changed files with 69 additions and 45 deletions

View File

@ -16,6 +16,9 @@ Bug fixes
* LUCENE-8726: ValueSource.asDoubleValuesSource() could leak a reference to * LUCENE-8726: ValueSource.asDoubleValuesSource() could leak a reference to
IndexSearcher (Alan Woodward, Yury Pakhomov) IndexSearcher (Alan Woodward, Yury Pakhomov)
* LUCENE-8719: FixedShingleFilter can miss shingles at the end of a token stream if
there are multiple paths with different lengths. (Alan Woodward)
Improvements Improvements
* LUCENE-8673: Use radix partitioning when merging dimensional points instead * LUCENE-8673: Use radix partitioning when merging dimensional points instead

View File

@ -92,63 +92,67 @@ public final class FixedShingleFilter extends GraphTokenFilter {
@Override @Override
public boolean incrementToken() throws IOException { public boolean incrementToken() throws IOException {
int shinglePosInc; int shinglePosInc, startOffset, endOffset;
if (incrementGraph() == false) {
if (incrementBaseToken() == false) {
return false;
}
// starting a shingle at a new base position, use base position increment
shinglePosInc = incAtt.getPositionIncrement();
}
else {
// starting a new shingle at the same base with a different graph, use a 0
// position increment
shinglePosInc = 0;
}
final int startOffset = offsetAtt.startOffset(); outer: while (true) {
int endOffset = offsetAtt.endOffset(); if (incrementGraph() == false) {
this.buffer.setEmpty(); if (incrementBaseToken() == false) {
this.buffer.append(termAtt);
// build the shingle by iterating over the current graph, adding
// filler tokens if we encounter gaps
for (int i = 1; i < shingleSize; i++) {
if (incrementGraphToken() == false) {
// we've reached the end of the token stream, check for trailing
// positions and add fillers if necessary
int trailingPositions = getTrailingPositions();
if (i + trailingPositions < shingleSize) {
// not enough trailing positions to make a full shingle
return false; return false;
} }
while (i < shingleSize) { // starting a shingle at a new base position, use base position increment
this.buffer.append(tokenSeparator).append(fillerToken); shinglePosInc = incAtt.getPositionIncrement();
i++; } else {
} // starting a new shingle at the same base with a different graph, use a 0
break; // position increment
shinglePosInc = 0;
} }
int posInc = incAtt.getPositionIncrement();
if (posInc > 1) { startOffset = offsetAtt.startOffset();
// if we have a posInc > 1, we need to fill in the gaps endOffset = offsetAtt.endOffset();
if (i + posInc > shingleSize) { this.buffer.setEmpty();
// if the posInc is greater than the shingle size, we need to add fillers this.buffer.append(termAtt);
// up to the shingle size but no further
// build the shingle by iterating over the current graph, adding
// filler tokens if we encounter gaps
for (int i = 1; i < shingleSize; i++) {
if (incrementGraphToken() == false) {
// we've reached the end of the token stream, check for trailing
// positions and add fillers if necessary
int trailingPositions = getTrailingPositions();
if (i + trailingPositions < shingleSize) {
// not enough trailing positions to make a full shingle
// start again at a different graph
continue outer;
}
while (i < shingleSize) { while (i < shingleSize) {
this.buffer.append(tokenSeparator).append(fillerToken); this.buffer.append(tokenSeparator).append(fillerToken);
i++; i++;
} }
break; break;
} }
// otherwise just add them in as far as we need int posInc = incAtt.getPositionIncrement();
while (posInc > 1) { if (posInc > 1) {
this.buffer.append(tokenSeparator).append(fillerToken); // if we have a posInc > 1, we need to fill in the gaps
posInc--; if (i + posInc > shingleSize) {
i++; // if the posInc is greater than the shingle size, we need to add fillers
// up to the shingle size but no further
while (i < shingleSize) {
this.buffer.append(tokenSeparator).append(fillerToken);
i++;
}
break;
}
// otherwise just add them in as far as we need
while (posInc > 1) {
this.buffer.append(tokenSeparator).append(fillerToken);
posInc--;
i++;
}
} }
this.buffer.append(tokenSeparator).append(termAtt);
endOffset = offsetAtt.endOffset();
} }
this.buffer.append(tokenSeparator).append(termAtt); break;
endOffset = offsetAtt.endOffset();
} }
clearAttributes(); clearAttributes();
this.offsetAtt.setOffset(startOffset, endOffset); this.offsetAtt.setOffset(startOffset, endOffset);

View File

@ -199,6 +199,23 @@ public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
new int[] { 1, 0, 0, 0, 1, 0, }); new int[] { 1, 0, 0, 0, 1, 0, });
} }
public void testTrailingGraphsOfDifferingLengths() throws IOException {
// a b:3/c d e f
TokenStream ts = new CannedTokenStream(
new Token("a", 0, 1),
new Token("b", 1, 2, 3, 3),
new Token("c", 0, 2, 3),
new Token("d", 2, 3),
new Token("e", 2, 3),
new Token("f", 4, 5)
);
assertTokenStreamContents(new FixedShingleFilter(ts, 3),
new String[]{ "a b f", "a c d", "c d e", "d e f"});
}
public void testParameterLimits() { public void testParameterLimits() {
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> { IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> {
new FixedShingleFilter(new CannedTokenStream(), 1); new FixedShingleFilter(new CannedTokenStream(), 1);