mirror of https://github.com/apache/lucene.git
LUCENE-8650: Fix end() and reset() in ConcatenatingTokenStream
This commit is contained in:
parent
4c565e5590
commit
f062a18ae7
|
@ -283,6 +283,10 @@ Bug fixes:
|
|||
* LUCENE-8654: Polygon2D#relateTriangle returns the wrong answer if polygon is inside
|
||||
the triangle. (Ignacio Vera)
|
||||
|
||||
* LUCENE-8650: ConcatenatingTokenStream did not correctly clear its state in reset(), and
|
||||
was not propagating final position increments from its child streams correctly.
|
||||
(Dan Meehl, Alan Woodward)
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-8026: ExitableDirectoryReader may now time out queries that run on
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.util.Iterator;
|
|||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.util.Attribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -39,10 +40,13 @@ public final class ConcatenatingTokenStream extends TokenStream {
|
|||
|
||||
private final TokenStream[] sources;
|
||||
private final OffsetAttribute[] sourceOffsets;
|
||||
private final PositionIncrementAttribute[] sourceIncrements;
|
||||
private final OffsetAttribute offsetAtt;
|
||||
private final PositionIncrementAttribute posIncAtt;
|
||||
|
||||
private int currentSource;
|
||||
private int offsetIncrement;
|
||||
private int initialPositionIncrement = 1;
|
||||
|
||||
/**
|
||||
* Create a new ConcatenatingTokenStream from a set of inputs
|
||||
|
@ -52,9 +56,12 @@ public final class ConcatenatingTokenStream extends TokenStream {
|
|||
super(combineSources(sources));
|
||||
this.sources = sources;
|
||||
this.offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
this.posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
this.sourceOffsets = new OffsetAttribute[sources.length];
|
||||
this.sourceIncrements = new PositionIncrementAttribute[sources.length];
|
||||
for (int i = 0; i < sources.length; i++) {
|
||||
this.sourceOffsets[i] = sources[i].addAttribute(OffsetAttribute.class);
|
||||
this.sourceIncrements[i] = sources[i].addAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -78,19 +85,26 @@ public final class ConcatenatingTokenStream extends TokenStream {
|
|||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
boolean newSource = false;
|
||||
while (sources[currentSource].incrementToken() == false) {
|
||||
if (currentSource >= sources.length - 1)
|
||||
return false;
|
||||
sources[currentSource].end();
|
||||
initialPositionIncrement = sourceIncrements[currentSource].getPositionIncrement();
|
||||
OffsetAttribute att = sourceOffsets[currentSource];
|
||||
if (att != null)
|
||||
offsetIncrement += att.endOffset();
|
||||
currentSource++;
|
||||
newSource = true;
|
||||
}
|
||||
|
||||
clearAttributes();
|
||||
sources[currentSource].copyTo(this);
|
||||
offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement, offsetAtt.endOffset() + offsetIncrement);
|
||||
if (newSource) {
|
||||
int posInc = posIncAtt.getPositionIncrement();
|
||||
posIncAtt.setPositionIncrement(posInc + initialPositionIncrement);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -98,7 +112,11 @@ public final class ConcatenatingTokenStream extends TokenStream {
|
|||
@Override
|
||||
public void end() throws IOException {
|
||||
sources[currentSource].end();
|
||||
int finalOffset = sourceOffsets[currentSource].endOffset() + offsetIncrement;
|
||||
int finalPosInc = sourceIncrements[currentSource].getPositionIncrement();
|
||||
super.end();
|
||||
offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
posIncAtt.setPositionIncrement(finalPosInc);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -107,6 +125,8 @@ public final class ConcatenatingTokenStream extends TokenStream {
|
|||
source.reset();
|
||||
}
|
||||
super.reset();
|
||||
currentSource = 0;
|
||||
offsetIncrement = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -21,7 +21,9 @@ import java.io.IOException;
|
|||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CannedTokenStream;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
|
@ -46,6 +48,33 @@ public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase {
|
|||
new int[]{ 0, 6, 12, 19, 25, 31 },
|
||||
new int[]{ 5, 11, 18, 24, 30, 36 });
|
||||
|
||||
// test re-use
|
||||
first.setReader(new StringReader("first words "));
|
||||
second.setReader(new StringReader("second words"));
|
||||
third.setReader(new StringReader(" third words"));
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "first", "words", "second", "words", "third", "words" },
|
||||
new int[]{ 0, 6, 12, 19, 25, 31 },
|
||||
new int[]{ 5, 11, 18, 24, 30, 36 },
|
||||
new int[]{ 1, 1, 1, 1, 1, 1 });
|
||||
|
||||
}
|
||||
|
||||
public void testOffsetGaps() throws IOException {
|
||||
CannedTokenStream cts1 = new CannedTokenStream(2, 10,
|
||||
new Token("a", 0, 1), new Token("b", 2, 3));
|
||||
CannedTokenStream cts2 = new CannedTokenStream(2, 10,
|
||||
new Token("c", 0, 1), new Token("d", 2, 3));
|
||||
|
||||
TokenStream ts = new ConcatenatingTokenStream(cts1, cts2);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "a", "b", "c", "d" },
|
||||
new int[]{ 0, 2, 10, 12 },
|
||||
new int[]{ 1, 3, 11, 13 },
|
||||
null,
|
||||
new int[]{ 1, 1, 3, 1 },
|
||||
null, 20, 2, null, false, null
|
||||
);
|
||||
}
|
||||
|
||||
public void testInconsistentAttributes() throws IOException {
|
||||
|
|
Loading…
Reference in New Issue