Mirror of https://github.com/apache/lucene.git (synced 2025-02-15 22:45:52 +00:00)
LUCENE-8719: Traverse all paths at the end of a TokenStream in FixedShingleFilter
This commit is contained in:
parent 003edafdc8
commit 6571d06c25
lucene/CHANGES.txt
@@ -16,6 +16,9 @@ Bug fixes
 * LUCENE-8726: ValueSource.asDoubleValuesSource() could leak a reference to
   IndexSearcher (Alan Woodward, Yury Pakhomov)
 
+* LUCENE-8719: FixedShingleFilter can miss shingles at the end of a token stream if
+  there are multiple paths with different lengths. (Alan Woodward)
+
 Improvements
 
 * LUCENE-8673: Use radix partitioning when merging dimensional points instead
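For orientation (not part of this commit), a minimal sketch of how FixedShingleFilter is typically consumed. The WhitespaceTokenizer, the sample text, the class name and the shingle size of 2 are illustrative assumptions; the filter only ever emits shingles of exactly the configured size.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.FixedShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class FixedShingleUsageSketch {
  public static void main(String[] args) throws IOException {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("the quick brown fox"));
    // emit shingles of exactly two tokens; shorter trailing shingles are never produced
    TokenStream shingles = new FixedShingleFilter(tokenizer, 2);
    CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
    shingles.reset();
    while (shingles.incrementToken()) {
      System.out.println(term.toString());  // "the quick", "quick brown", "brown fox"
    }
    shingles.end();
    shingles.close();
  }
}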
lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
@@ -92,63 +92,67 @@ public final class FixedShingleFilter extends GraphTokenFilter {
   @Override
   public boolean incrementToken() throws IOException {
 
-    int shinglePosInc;
-    if (incrementGraph() == false) {
-      if (incrementBaseToken() == false) {
-        return false;
-      }
-      // starting a shingle at a new base position, use base position increment
-      shinglePosInc = incAtt.getPositionIncrement();
-    }
-    else {
-      // starting a new shingle at the same base with a different graph, use a 0
-      // position increment
-      shinglePosInc = 0;
-    }
-
-    final int startOffset = offsetAtt.startOffset();
-    int endOffset = offsetAtt.endOffset();
-    this.buffer.setEmpty();
-    this.buffer.append(termAtt);
-
-    // build the shingle by iterating over the current graph, adding
-    // filler tokens if we encounter gaps
-    for (int i = 1; i < shingleSize; i++) {
-      if (incrementGraphToken() == false) {
-        // we've reached the end of the token stream, check for trailing
-        // positions and add fillers if necessary
-        int trailingPositions = getTrailingPositions();
-        if (i + trailingPositions < shingleSize) {
-          // not enough trailing positions to make a full shingle
-          return false;
-        }
-        while (i < shingleSize) {
-          this.buffer.append(tokenSeparator).append(fillerToken);
-          i++;
-        }
-        break;
-      }
-      int posInc = incAtt.getPositionIncrement();
-      if (posInc > 1) {
-        // if we have a posInc > 1, we need to fill in the gaps
-        if (i + posInc > shingleSize) {
-          // if the posInc is greater than the shingle size, we need to add fillers
-          // up to the shingle size but no further
-          while (i < shingleSize) {
-            this.buffer.append(tokenSeparator).append(fillerToken);
-            i++;
-          }
-          break;
-        }
-        // otherwise just add them in as far as we need
-        while (posInc > 1) {
-          this.buffer.append(tokenSeparator).append(fillerToken);
-          posInc--;
-          i++;
-        }
-      }
-      this.buffer.append(tokenSeparator).append(termAtt);
-      endOffset = offsetAtt.endOffset();
-    }
+    int shinglePosInc, startOffset, endOffset;
+
+    outer: while (true) {
+      if (incrementGraph() == false) {
+        if (incrementBaseToken() == false) {
+          return false;
+        }
+        // starting a shingle at a new base position, use base position increment
+        shinglePosInc = incAtt.getPositionIncrement();
+      } else {
+        // starting a new shingle at the same base with a different graph, use a 0
+        // position increment
+        shinglePosInc = 0;
+      }
+
+      startOffset = offsetAtt.startOffset();
+      endOffset = offsetAtt.endOffset();
+      this.buffer.setEmpty();
+      this.buffer.append(termAtt);
+
+      // build the shingle by iterating over the current graph, adding
+      // filler tokens if we encounter gaps
+      for (int i = 1; i < shingleSize; i++) {
+        if (incrementGraphToken() == false) {
+          // we've reached the end of the token stream, check for trailing
+          // positions and add fillers if necessary
+          int trailingPositions = getTrailingPositions();
+          if (i + trailingPositions < shingleSize) {
+            // not enough trailing positions to make a full shingle
+            // start again at a different graph
+            continue outer;
+          }
+          while (i < shingleSize) {
+            this.buffer.append(tokenSeparator).append(fillerToken);
+            i++;
+          }
+          break;
+        }
+        int posInc = incAtt.getPositionIncrement();
+        if (posInc > 1) {
+          // if we have a posInc > 1, we need to fill in the gaps
+          if (i + posInc > shingleSize) {
+            // if the posInc is greater than the shingle size, we need to add fillers
+            // up to the shingle size but no further
+            while (i < shingleSize) {
+              this.buffer.append(tokenSeparator).append(fillerToken);
+              i++;
+            }
+            break;
+          }
+          // otherwise just add them in as far as we need
+          while (posInc > 1) {
+            this.buffer.append(tokenSeparator).append(fillerToken);
+            posInc--;
+            i++;
+          }
+        }
+        this.buffer.append(tokenSeparator).append(termAtt);
+        endOffset = offsetAtt.endOffset();
+      }
+      break;
+    }
+
     clearAttributes();
     this.offsetAtt.setOffset(startOffset, endOffset);
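The heart of the change: the body of incrementToken() is now wrapped in the labelled outer loop, and when the stream ends before a full shingle can be built on the current path (i + trailingPositions < shingleSize) the filter does continue outer to try the next graph path, where it previously returned false and dropped any shingles the remaining, longer paths could still produce. Below is a hedged, self-contained sketch (not part of the commit; the class name and the use of System.out are assumptions, and CannedTokenStream and Token come from lucene-test-framework) that replays the differing-length-path graph from the new test through a 3-token shingle filter and prints each shingle with its position increment.

import java.io.IOException;
import org.apache.lucene.analysis.CannedTokenStream;  // lucene-test-framework
import org.apache.lucene.analysis.Token;               // lucene-test-framework
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.FixedShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class TrailingPathsSketch {
  public static void main(String[] args) throws IOException {
    // same graph as the new test below: "a", then either "b" (spanning three
    // positions) or "c d e", then "f"
    TokenStream ts = new CannedTokenStream(
        new Token("a", 0, 1),
        new Token("b", 1, 2, 3, 3),
        new Token("c", 0, 2, 3),
        new Token("d", 2, 3),
        new Token("e", 2, 3),
        new Token("f", 4, 5));
    TokenStream shingles = new FixedShingleFilter(ts, 3);
    CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = shingles.addAttribute(PositionIncrementAttribute.class);
    shingles.reset();
    while (shingles.incrementToken()) {
      // with this fix all four shingles appear: "a b f", "a c d", "c d e", "d e f"
      System.out.println(posInc.getPositionIncrement() + "\t" + term.toString());
    }
    shingles.end();
    shingles.close();
  }
}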
lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
@@ -199,6 +199,23 @@ public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
         new int[] { 1, 0, 0, 0, 1, 0, });
   }
 
+  public void testTrailingGraphsOfDifferingLengths() throws IOException {
+
+    // a b:3/c d e f
+    TokenStream ts = new CannedTokenStream(
+        new Token("a", 0, 1),
+        new Token("b", 1, 2, 3, 3),
+        new Token("c", 0, 2, 3),
+        new Token("d", 2, 3),
+        new Token("e", 2, 3),
+        new Token("f", 4, 5)
+    );
+
+    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
+        new String[]{ "a b f", "a c d", "c d e", "d e f"});
+
+  }
+
   public void testParameterLimits() {
     IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> {
       new FixedShingleFilter(new CannedTokenStream(), 1);
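Reading the canned tokens: "b" carries posLength 3, so it is an alternative to the whole "c d e" branch; both branches follow "a" and rejoin at "f". The path through "b" yields the shingle "a b f", while the "c d e" path yields "a c d", "c d e" and "d e f". Before this commit the filter could give up once one of these trailing paths ran out of positions and miss shingles from the other path, which is what the assertion above now guards against.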