diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c821eeb507d..cf9f100d3ca 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -28,6 +28,9 @@ Bug fixes * LUCENE-8726: ValueSource.asDoubleValuesSource() could leak a reference to IndexSearcher (Alan Woodward, Yury Pakhomov) +* LUCENE-8719: FixedShingleFilter can miss shingles at the end of a token stream if + there are multiple paths with different lengths. (Alan Woodward) + Improvements * LUCENE-8673: Use radix partitioning when merging dimensional points instead diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java index 8f7eb95071c..77be29bfb2e 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java @@ -92,63 +92,67 @@ public final class FixedShingleFilter extends GraphTokenFilter { @Override public boolean incrementToken() throws IOException { - int shinglePosInc; - if (incrementGraph() == false) { - if (incrementBaseToken() == false) { - return false; - } - // starting a shingle at a new base position, use base position increment - shinglePosInc = incAtt.getPositionIncrement(); - } - else { - // starting a new shingle at the same base with a different graph, use a 0 - // position increment - shinglePosInc = 0; - } + int shinglePosInc, startOffset, endOffset; - final int startOffset = offsetAtt.startOffset(); - int endOffset = offsetAtt.endOffset(); - this.buffer.setEmpty(); - this.buffer.append(termAtt); - - // build the shingle by iterating over the current graph, adding - // filler tokens if we encounter gaps - for (int i = 1; i < shingleSize; i++) { - if (incrementGraphToken() == false) { - // we've reached the end of the token stream, check for trailing - // positions and add fillers if necessary - int trailingPositions = getTrailingPositions(); - if (i + trailingPositions < shingleSize) { - // not enough trailing positions to make a full shingle + outer: while (true) { + if (incrementGraph() == false) { + if (incrementBaseToken() == false) { return false; } - while (i < shingleSize) { - this.buffer.append(tokenSeparator).append(fillerToken); - i++; - } - break; + // starting a shingle at a new base position, use base position increment + shinglePosInc = incAtt.getPositionIncrement(); + } else { + // starting a new shingle at the same base with a different graph, use a 0 + // position increment + shinglePosInc = 0; } - int posInc = incAtt.getPositionIncrement(); - if (posInc > 1) { - // if we have a posInc > 1, we need to fill in the gaps - if (i + posInc > shingleSize) { - // if the posInc is greater than the shingle size, we need to add fillers - // up to the shingle size but no further + + startOffset = offsetAtt.startOffset(); + endOffset = offsetAtt.endOffset(); + this.buffer.setEmpty(); + this.buffer.append(termAtt); + + // build the shingle by iterating over the current graph, adding + // filler tokens if we encounter gaps + for (int i = 1; i < shingleSize; i++) { + if (incrementGraphToken() == false) { + // we've reached the end of the token stream, check for trailing + // positions and add fillers if necessary + int trailingPositions = getTrailingPositions(); + if (i + trailingPositions < shingleSize) { + // not enough trailing positions to make a full shingle + // start again at a different graph + continue outer; + } while (i < shingleSize) { this.buffer.append(tokenSeparator).append(fillerToken); i++; } break; } - // otherwise just add them in as far as we need - while (posInc > 1) { - this.buffer.append(tokenSeparator).append(fillerToken); - posInc--; - i++; + int posInc = incAtt.getPositionIncrement(); + if (posInc > 1) { + // if we have a posInc > 1, we need to fill in the gaps + if (i + posInc > shingleSize) { + // if the posInc is greater than the shingle size, we need to add fillers + // up to the shingle size but no further + while (i < shingleSize) { + this.buffer.append(tokenSeparator).append(fillerToken); + i++; + } + break; + } + // otherwise just add them in as far as we need + while (posInc > 1) { + this.buffer.append(tokenSeparator).append(fillerToken); + posInc--; + i++; + } } + this.buffer.append(tokenSeparator).append(termAtt); + endOffset = offsetAtt.endOffset(); } - this.buffer.append(tokenSeparator).append(termAtt); - endOffset = offsetAtt.endOffset(); + break; } clearAttributes(); this.offsetAtt.setOffset(startOffset, endOffset); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java index 85c7dc6d14e..3c7ffe07521 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java @@ -199,6 +199,23 @@ public class FixedShingleFilterTest extends BaseTokenStreamTestCase { new int[] { 1, 0, 0, 0, 1, 0, }); } + public void testTrailingGraphsOfDifferingLengths() throws IOException { + + // a b:3/c d e f + TokenStream ts = new CannedTokenStream( + new Token("a", 0, 1), + new Token("b", 1, 2, 3, 3), + new Token("c", 0, 2, 3), + new Token("d", 2, 3), + new Token("e", 2, 3), + new Token("f", 4, 5) + ); + + assertTokenStreamContents(new FixedShingleFilter(ts, 3), + new String[]{ "a b f", "a c d", "c d e", "d e f"}); + + } + public void testParameterLimits() { IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> { new FixedShingleFilter(new CannedTokenStream(), 1);